Explorar el Código

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/time/re_servicetime.py
fangjiasheng hace 3 años
padre
commit
0d5679a4e4
Se han modificado 36 ficheros con 12330 adiciones y 565 borrados
  1. 2 2
      BiddingKG.iml
  2. 23 74
      BiddingKG/app.py
  3. 4 0
      BiddingKG/dl/common/Utils.py
  4. 22 3
      BiddingKG/dl/entityLink/entityLink.py
  5. 0 0
      BiddingKG/dl/industry/__init__.py
  6. 10 0
      BiddingKG/dl/industry/ab.py
  7. 225 0
      BiddingKG/dl/industry/app.py
  8. 289 0
      BiddingKG/dl/industry/data_util.py
  9. 116 0
      BiddingKG/dl/industry/main.py
  10. 272 0
      BiddingKG/dl/industry/run_industry_server.py
  11. 389 168
      BiddingKG/dl/interface/Preprocessing.py
  12. 119 23
      BiddingKG/dl/interface/extract.py
  13. 144 31
      BiddingKG/dl/interface/getAttributes.py
  14. 12 4
      BiddingKG/dl/interface/modelFactory.py
  15. 750 100
      BiddingKG/dl/interface/predictor.py
  16. 1 1
      BiddingKG/dl/ratio/re_ratio.py
  17. 1 1
      BiddingKG/dl/table_head/models/model.py
  18. 1 1
      BiddingKG/dl/table_head/pre_process.py
  19. 1 27
      BiddingKG/dl/test/12.py
  20. 72 38
      BiddingKG/dl/test/test4.py
  21. 6 3
      BiddingKG/dl/test/测试整个要素提取流程.py
  22. 9 7
      BiddingKG/extract.app.json
  23. 2917 0
      BiddingKG/getAttributes.py
  24. 1517 10
      BiddingKG/maxcompute/1.py
  25. 894 25
      BiddingKG/maxcompute/documentDumplicate.py
  26. 51 17
      BiddingKG/maxcompute/documentMerge.py
  27. 46 1
      BiddingKG/maxcompute/enterpriseFix.py
  28. 2 2
      BiddingKG/maxcompute/evaluates.py
  29. 113 27
      BiddingKG/maxcompute/proposedBuildingProject.py
  30. 386 0
      BiddingKG/maxcompute/去重规则.md
  31. 6 0
      BiddingKG/maxcompute/重跑历史数据.md
  32. 3438 0
      BiddingKG/predictor.py
  33. 325 0
      BiddingKG/re_servicetime.py
  34. 17 0
      BiddingKG/restart_extract.sh
  35. 150 0
      BiddingKG/run_extract_server.py
  36. BIN
      BiddingKG/vocab_word.pk

+ 2 - 2
BiddingKG.iml

@@ -2,13 +2,13 @@
 <module type="JAVA_MODULE" version="4">
   <component name="FacetManager">
     <facet type="Python" name="Python">
-      <configuration sdkName="Python 3.5 (BiddingKG)" />
+      <configuration sdkName="Python 3.5 (dl_nlp)" />
     </facet>
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
     <orderEntry type="jdk" jdkName="Remote Python 3.5.0 (sftp://yons@192.168.2.103:22/data/home/python/anaconda3/envs/dl_nlp/bin/python)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="library" name="Python 3.5 (BiddingKG) interpreter library" level="application" />
+    <orderEntry type="library" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>
 </module>

+ 23 - 74
BiddingKG/app.py

@@ -21,8 +21,8 @@ import inspect
 from threading import Thread
 import traceback
 
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
 sys.path.append(os.path.abspath("."))
 
 #自定义jsonEncoder
@@ -68,68 +68,12 @@ class MyProcessor(allspark.BaseProcessor):
         _content = data.get("content","")
         _page_time = data.get("page_time","")
         data_res = ""
+
+        web_source_no = data.get("web_source_no","")
+        original_docchannel = data.get("original_docchannel","")
         try:
             if "content" in data:
-                content = data['content']
-                data_res  = predict(_doc_id,_content,_title,_page_time)
-                # log("get request of doc_id:%s"%(_doc_id))
-                # k = str(uuid.uuid4())
-                # cost_time = dict()
-                #
-                # start_time = time.time()
-                # list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[k,content,"",_doc_id,_title]],useselffool=True)
-                # log("get preprocessed done of doc_id%s"%(_doc_id))
-                # cost_time["preprocess"] = time.time()-start_time
-                # cost_time.update(_cost_time)
-                # '''
-                # for articles in list_articles:
-                #     print(articles.content)
-                #
-                # '''
-                # start_time = time.time()
-                # codeName = self.codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
-                # log("get codename done of doc_id%s"%(_doc_id))
-                # cost_time["codename"] = time.time()-start_time
-                #
-                # start_time = time.time()
-                # self.premPredict.predict(list_sentences,list_entitys)
-                #
-                # self.premPredict.predict(list_sentences,list_entitys)
-                # log("get prem done of doc_id%s"%(_doc_id))
-                # cost_time["prem"] = time.time()-start_time
-                # start_time = time.time()
-                # self.roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-                # # self.roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-                # cost_time["rule"] = time.time()-start_time
-                # start_time = time.time()
-                # self.epcPredict.predict(list_sentences,list_entitys)
-                # log("get epc done of doc_id%s"%(_doc_id))
-                # cost_time["person"] = time.time()-start_time
-                # start_time = time.time()
-                # entityLink.link_entitys(list_entitys)
-                # '''
-                # for list_entity in list_entitys:
-                #     for _entity in list_entity:
-                #         for _ent in _entity.linked_entitys:
-                #             print(_entity.entity_text,_ent.entity_text)
-                # '''
-                # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-                # log("get attributes done of doc_id%s"%(_doc_id))
-                # cost_time["attrs"] = time.time()-start_time
-                #
-                #
-                # '''
-                #
-                #
-                # for entitys in list_entitys:
-                #     for entity in entitys:
-                #         print(entity.entity_text,entity.entity_type,entity.sentence_index,entity.begin_index,entity.label,entity.values)
-                # '''
-                # #print(prem)
-                # data_res = predict(docid)
-                # data_res["cost_time"] = cost_time
-                # data_res["success"] = True
-                #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+                data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no,original_docchannel)
             else:
                 data_res = json.dumps({"success":False,"msg":"content not passed"})
 
@@ -170,10 +114,6 @@ class MyProcessor(allspark.BaseProcessor):
         data = data.decode("utf8")
         data = json.loads(data,encoding="utf8")
 
-        _doc_id = data.get("doc_id","")
-        _title = data.get("title","")
-        _content = data.get("content","")
-        _page_time = data.get("page_time","")
 
         status_code = 200
         list_result = []
@@ -187,23 +127,32 @@ class MyProcessor(allspark.BaseProcessor):
             status_code = 302#超时被kill
             data_res = json.dumps({"success":False,"msg":"timeout"})
         else:
-            status_code += int((time.time()-start_time)//self.timeOfType+1)
+            status_code += int((time.time()-start_time)%self.timeOfType+1)
             data_res = list_result[0]
         _resp = data_res
         # _resp = predict(doc_id=_doc_id,text=_content,title=_title,page_time=_page_time)
 
         return self.post_process(_resp),status_code
-        
+
+def getPort(argv):
+    port = 15030
+    for item in argv:
+        _l = str(item).split("port=")
+        if len(_l)>1:
+            port = int(_l[-1])
+            break
+    return port
         
 if __name__ == '__main__':
     # paramter worker_threads indicates concurrency of processing
     #本地运行
-    allspark.default_properties().put("rpc.keepalive", 120000)
-
-
-    runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
+    port = getPort(argv=sys.argv)
+    allspark.default_properties().put("rpc.keepalive", 250000)
+    allspark.default_properties().put("rpc.max_queue_size", 100)
+    log("port==%d"%(port))
+    #
+    #
+    runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:%d"%(port))
     #PAI平台运行
     # runner = MyProcessor()
-
-
     runner.run()

+ 4 - 0
BiddingKG/dl/common/Utils.py

@@ -450,6 +450,10 @@ def getUnifyMoney(money):
                 elif len(subMoneys[0])==1:
                     if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                         result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
+                # subMoneys[0]中无金额单位,不可再拆分
+                elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
+                    subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 else:
                     result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
                 if len(subMoneys)>1:

+ 22 - 3
BiddingKG/dl/entityLink/entityLink.py

@@ -69,7 +69,7 @@ place_list = get_place_list()
 place_pattern = "|".join(place_list)
 
 
-def link_entitys(list_entitys,on_value=0.81):
+def link_entitys(list_entitys,on_value=1):#on_value=0.81
     for list_entity in list_entitys:
         range_entity = []
         for _entity in list_entity:
@@ -121,7 +121,27 @@ def link_entitys(list_entitys,on_value=0.81):
                             _entity.entity_text = _ent.entity_text
                             used_linked_entitys.append(_ent)
                             # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
+# 用于去重的标题
+def doctitle_refine(doctitle):
+    _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
+                             r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
+    return _doctitle_refine
+# 前100个公司实体
+def get_nlp_enterprise(list_entity):
+    nlp_enterprise = []
+    nlp_enterprise_attachment = []
+    max_num = 100
+    list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
+    for entity in list_entity:
+        if entity.entity_type in ['org','company']:
+            if not entity.in_attachment:
+                if entity.entity_text not in nlp_enterprise:
+                    nlp_enterprise.append(entity.entity_text)
+            else:
+                if entity.entity_text not in nlp_enterprise_attachment:
+                    nlp_enterprise_attachment.append(entity.entity_text)
 
+    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
 
 def getEnterprisePath():
     filename = "LEGAL_ENTERPRISE.txt"
@@ -356,9 +376,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
 def isLegalEnterprise(name):
     is_legal = True
-    if re.search("^[省市区县]",name) is not None or re.search("^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name) or re.search("标段|标包|名称",name) is not None:
+    if re.search("^[省市区县]",name) is not None or re.search("^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name) or re.search("标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|(主)",name) is not None:
         is_legal = False
-        print("is_legal:", name , is_legal)
     return is_legal
 
 def fix_LEGAL_ENTERPRISE():

+ 0 - 0
BiddingKG/dl/industry/__init__.py


+ 10 - 0
BiddingKG/dl/industry/ab.py

@@ -0,0 +1,10 @@
+
+
+import sys
+
+print(ord('0'))
+
+
+print(ord('2'))
+print(ord('0'))
+

+ 225 - 0
BiddingKG/dl/industry/app.py

@@ -0,0 +1,225 @@
+'''
+Created on 2019年12月3日
+
+@author: User
+'''
+
+import allspark
+import sys
+import os
+import json
+import logging
+import time
+import uuid
+sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
+
+import tensorflow as tf
+from text_classifier_pai.main import Text_Classifier
+import numpy as np
+import ctypes
+import inspect
+from threading import Thread
+import traceback
+
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
+sys.path.append(os.path.abspath("."))
+
+#自定义jsonEncoder
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32, 
+        np.float64)):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+def _async_raise(tid, exctype):
+    """raises the exception, performs cleanup if needed"""
+    tid = ctypes.c_long(tid)
+    if not inspect.isclass(exctype):
+        exctype = type(exctype)
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
+    if res == 0:
+        raise ValueError("invalid thread id")
+    elif res != 1:
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
+def stop_thread(thread):
+    _async_raise(thread.ident, SystemExit)
+
+
+class MyProcessor(allspark.BaseProcessor):
+    """ MyProcessor is a example
+        you can send mesage like this to predict
+        curl -v http://127.0.0.1:8080/api/predict/service_name -d '2 105'
+    """
+    def run_thread(self,data,list_result):
+        # data = data.decode("utf8")
+        # data = json.loads(data,encoding="utf8")
+        # print('准备处理请求:')
+        k = str(uuid.uuid4())
+        cost_time = dict()
+        if "doc_id" in data:
+            _doc_id = data['doc_id']
+        else:
+            _doc_id = ""
+        if "title" in data:
+            _title = data["title"]
+        else:
+            _title = ""
+        data_res = ""
+        try:
+            if "content" in data:
+                # logging.info("get request of doc_id:%s"%(_doc_id))
+                k = str(uuid.uuid4())
+                cost_time = dict()
+                content = data['content']
+
+                start_time = time.time()
+                # print('准备预处理,文章内容:',content[:20])
+                process, ids = self.classifier.process([content])
+                # logging.info("get preprocessed done of doc_id%s"%(_doc_id))
+                # print('预处理完成')
+                cost_time["preprocess"] = time.time()-start_time
+                # cost_time.update(_cost_time)
+
+                start_time = time.time()
+                # print('开始预测')
+                # with self.classifier.sess.graph.as_default():
+                logits, ids = self.classifier.predict(process, ids)
+                # print('预测完成')
+                # logging.info("get predict done of doc_id%s"%(_doc_id))
+                cost_time["predict"] = time.time()-start_time
+
+                start_time = time.time()
+                # print('准备提取结果')
+                result = self.classifier.get_results(logits, ids)
+                class_name = result[0][1]  # 得到预测出来的分类名称
+                subclass, topclass = self.classifier.dic_label[class_name].split(',') # 根据名称查找大类和小类名称
+                # print('返回类别成功')
+                # logging.info("get result done of doc_id%s"%(_doc_id))
+                cost_time["result"] = time.time()-start_time
+
+                data_res = {"class":topclass, "class_name":class_name, "subclass":subclass}
+                data_res["success"] = True
+                data_res["cost_time"] = cost_time
+
+                #print(prem)
+                # data_res = {'predict':result[0][1]}
+                # data_res["cost_time"] = cost_time
+                # data_res["success"] = True
+                #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+            else:
+                data_res = {"success":False,"msg":"content not passed"}
+
+
+        except Exception as e:
+            traceback.print_exc()
+            data_res = {"success":False,"msg":str(e)}
+            logging.error('Exception:%s'%str(e))
+        # 以json形式返回结果
+        #_resp = json.dumps(data_res,cls=MyEncoder)
+        #print(str(data["flag"])+str(data))
+        logging.info("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+        list_result.append(data_res)
+
+    def initialize(self):
+        """ load module, executed once at the start of the service
+             do service intialization and load models in this function.
+        """'''
+        '''
+        self.classifier = Text_Classifier()
+        self.timeout = 60
+        self.status_types = 5
+        self.timeOfType = self.timeout//self.status_types
+        logging.info('初始化完成, 服务端口15000')
+        print('初始化完成, 服务端口15000')
+        
+        
+    def pre_proccess(self, data):
+        """ data format pre process
+        """
+        x, y = data.split(b' ')
+        return int(x), int(y)
+    def post_process(self, data):
+        """ proccess after process
+        """
+        return bytes(data, encoding='utf8')
+    
+    
+    def process(self, data):
+        """ process the request data
+        """
+        try:
+            data = data.decode("utf8")
+            data = json.loads(data,encoding="utf8")
+            _timeout = self.timeout
+
+            status_code = 200
+            if "timeout" in data:
+                _timeout = data["timeout"]
+            list_result = []
+            t = Thread(target=self.run_thread,args=(data,list_result))
+            start_time = time.time()
+            t.start()
+            t.join(_timeout)
+            if t.is_alive():
+                stop_thread(t)
+                status_code = 302#超时被kill
+                data_res = {"success":False,"msg":"timeout"}
+            else:
+                status_code += int((time.time()-start_time)//self.timeOfType+1)
+                data_res = list_result[0]
+            _resp = json.dumps(data_res,cls=MyEncoder)
+
+            return self.post_process(_resp),status_code
+        except Exception as e:
+            pass
+        return self.post_process(json.dumps({},cls=MyEncoder)),200
+
+
+def main():
+    # 创建一个logging对象
+    logger = logging.getLogger()
+    # 创建一个文件对象
+    fh = logging.FileHandler('log_dir/esa_classifier_pai.log', encoding='utf-8')
+    # 创建一个屏幕对象
+    sh = logging.StreamHandler()
+    # 配置显示格式
+    formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
+    fh.setFormatter(formatter) # 把格式绑定到两个对象上
+    sh.setFormatter(formatter)
+
+    logger.addHandler(fh) # 将两个句柄绑定到logger
+    logger.addHandler(sh)
+
+    logger.setLevel(10)
+    fh.setLevel(10)
+    sh.setLevel(30)
+
+    allspark.default_properties().put("rpc.keepalive", 250000)
+    allspark.default_properties().put("rpc.max_queue_size", 100)
+    # 本地运行执行下面代码,阿里云上不要参数
+    runner = MyProcessor(worker_threads=20,worker_processes=1,endpoint="0.0.0.0:15000")
+
+    #PAI平台运行
+    #runner = MyProcessor()
+    runner.run()
+        
+if __name__ == '__main__':
+    main()
+    # paramter worker_threads indicates concurrency of processing
+    #本地运行
+    # tf.app.run()
+    # allspark.default_properties().put("rpc.keepalive", 60000)
+    # runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15011")
+    #
+    #
+    # #PAI平台运行
+    # # runner = MyProcessor()
+    # runner.run()

+ 289 - 0
BiddingKG/dl/industry/data_util.py

@@ -0,0 +1,289 @@
+# encoding=utf-8
+import os
+import re
+import pickle
+import gensim
+import numpy as np
+import pandas as pd
+from pyhanlp import *
+import keras.backend as K
+from keras.preprocessing.sequence import pad_sequences
+
+# curdir = os.getcwd()
+curdir = os.path.dirname(__file__)
+def load(path):
+    '''
+    pickle 加载pkl 文件 
+    '''
+    with open(path, 'rb') as f:
+        return pickle.load(f)
+
+def get_remove_word():
+    '''
+    加载停用词、不重要的词
+    '''
+    stopwords_path = curdir + '/pickle_1/bidi_classify_stop_words.csv' # 停用词文件 
+    # stopwords_path = '/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/bidi_classify_stop_words_20200316.csv' # 20200317新增部分非关键词停用词
+    df_stopwords = pd.read_csv(stopwords_path)
+    remove_word  = df_stopwords['stopword'].values.tolist()
+    return remove_word
+
+def get_embedding():
+    '''
+    加载文件,返回词典、keras tokennizer对象,词向量矩阵
+    '''
+    word_index = load(curdir + '/pickle_1/word_index_955871.pk') #加载词典文件 word:id
+    tokenizer = load(curdir + '/pickle_1/tokenizer_955871.pk')   # 加载训练后keras tokenizer对象
+    w2v_model_path = curdir + '/pickle_1/thr_100_model.vector'      # 加载词向量文件
+    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path,binary=True)
+    embedding_matrix = np.random.random((len(word_index) + 1, 100))
+    # embedding_matrix = np.zeros((len(word_index) + 1, 100))  # 随机初始化改成0初始化
+    count_not_in_model = 0
+    count_in_model = 0
+    for word, i in word_index.items():
+        if word in w2v_model:
+            count_in_model += 1
+            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
+        else:
+            count_not_in_model += 1
+    return word_index, tokenizer, embedding_matrix
+
+def get_label():
+    '''
+    加载标签字典,返回字典label_mapping {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备'  ; labels10 所有类别的中文名称
+    '''
+    # label_mapping = load('/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/label_mapping_f.pk') # 耔录原来211分类模型
+    # label_mapping = load(curdir + '/pickle_1/label_mapping210.pkl') # 2月份去除教育设备分类后210类
+    label_mapping = load(curdir + '/pickle_1/id2label.pkl') # 20200928 修改标注标准,完成重新标注后总有203类
+    labels10 = list(label_mapping.values())
+    return label_mapping,labels10
+
+def get_dic():
+    '''
+    加载类别字典,估计是新旧类别: 豆类、油料和薯类种植': '农业,农、林、牧、渔业', '蔬菜、食用菌及园艺作物种植': '农业,农、林、牧、渔业'
+    '''
+    # dic_label_path = curdir + '/pickle_1/class_subclass_dic211.pk'
+    dic_label_path = curdir + '/pickle_1/class2dalei_menlei.pkl'
+    dic_label = load(dic_label_path)
+    return dic_label
+
+def model_in(r1, label_mapping, id):
+    '''
+    获取每个文章的中文类别名称
+    @Argus: r1:np.array 预测结果 ; label_mapping:分类类别字典 0: '安防系统
+    @Return:中文分类名称 
+    '''
+    all_end = r1
+    aa2 = []
+    for i in range(all_end.shape[0]):
+        c1 = label_mapping[np.argmax(all_end[i])]
+        aa2.append(c1)
+    union = []
+    for x in range(len(id)):
+        union.append([id[x],aa2[x]])
+    return union
+
+def convertJlistToPlist(jList):
+    '''
+    将javaList 转为pythonlist     
+    '''
+    # print('分词完成,准备转为Python list')
+    ret = []
+    if jList is None:
+        return ret
+    for i in range(jList.size()):
+        ret.append(str(jList.get(i)))
+    return ret 
+
+def clean_RmWord(text, remove_word):
+    '''
+    去除没用的词语
+    '''
+    text_copy = text.copy()
+    for i in text:
+        if i in remove_word:
+            text_copy.remove(i)
+    text_copy = " ".join(text_copy)
+    return text_copy
+
+def handle_doc1(article_set10_1, remove_word):
+    '''
+    句子分词并删除单字、重复、无关词语
+    @Argus: article_set10_1: 包含待处理字符串的Series
+    @Return: 处理后的结果
+    '''
+    HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+    HanLP.Config.ShowTermNature = False
+    # print('定义HanLP config 完成')
+    article_set10_seg_1 = article_set10_1.map(lambda x: convertJlistToPlist(HanLP.segment(x)))
+    # print('hanlp 分词后 : ', ','.join(article_set10_seg_1[0]))
+    # print('分词完成')
+    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1)) # 删除单个字
+    # print('删除单个字完成')
+    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1 and re.search('政府|公司|时间', word)==None))  # 删除单个字及某些词
+    # article_set10_seg_rm = article_set10_seg_1.map(lambda x: clean_RmWord(x.split(), remove_word)) # 删除无用、重复词语
+    article_set10_seg_rm = article_set10_seg_1.map(lambda x: ' '.join(word for word in x))  # 临时修改调用
+    # print('删除无用、重复词语完成')
+    article_set10_seg_rm = article_set10_seg_rm.map(lambda x: x.split())
+    return article_set10_seg_rm
+
+def cleanSeg(text):
+    '''
+    清除干扰字符(英文、日期、数字、标点符号)
+    '''
+    # text = re.sub('[a-zA-Z]', '', text)
+    # text = text.replace('\n', ' ')
+    # text = re.sub(r"-", " ", text)
+    # text = re.sub(r"\d+/\d/\d+", "", text)
+    # text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
+    # text = re.sub(r"[\w]+@[\.\w]+", "", text)
+    # text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
+    # pure_text = ''
+    # for letter in text:
+    #     if letter.isalpha() or letter == ' ':
+    #         pure_text += letter
+    # text = ' '.join(word for word in pure_text.split() if len(word) > 1)
+    # text = text.replace(' ', '')
+    text = re.sub("<\s*script[^>]*>.*?<\s*/\s*script\s*>", "", text)
+    text = re.sub("<\s*stype[^>]*>.*<\s*/\s*stype\s*>", "", text)
+    text = re.sub("</?\w+[^>]*>", "", text)
+    text = re.sub('<!--.*-->|{Font|border.*}|{.*font.*}', '', text)
+    text = re.sub('品目|\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]','',text)
+    # text_list = [re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]','',text) for text in text.split('\n')]
+    # text = ''.join(text_list)
+    return text 
+
+def fetch_sub_data_1(data, num):
+    '''
+    获取文本前N个字符
+    '''
+    return data[:num]
+
+def data_set(text):
+    '''
+    保持顺序词语去重
+    '''
+    l2 = []
+    [l2.append(i) for i in text if i not in l2]
+    return l2
+
+def clean_word(article_set10,remove_word):
+    """
+    清理数据,清除符号、字母、数字等,统一文章长度,对句子进行分词,删除单字、重复、无关词语、停用词
+    :param article_set10: 原数据,list
+    :param remove_word: 停用词表,list
+    :return: Series
+    """
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))  # 清除干扰字符(英文、日期、数字、标点符号)
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))  # 获取文本前N个字符
+    # test
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word) # 句子分词并删除单字、重复、无关词语
+    # test
+    x_train_df_10 = article_set10_seg_rm.copy()
+    x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  #  保持顺序词语去重
+    return x_train_df_10
+
+def clean_word_with_tokenizer(article_set10,remove_word,tokenizer):
+    """
+    清理数据,清除符号、字母、数字、停用词,分词
+    :param article_set10: 原数据,list
+    :param remove_word: 停用词表,list
+    :return: Series
+    """
+    # print('clean_word_with_tokenizer 开始')
+    id = [i[0] for i in article_set10]
+    article_set10 = [i[1] for i in article_set10]
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))
+    # test
+    # print('准备分词 ')
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)
+    # print(article_set10_seg_rm)
+    # test
+    # print('分词结束')
+    x_train_df_10 = article_set10_seg_rm.copy()
+    # x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # 保持顺序词语去重 这里原来没有,比训练时少做了一步
+    sequences = tokenizer.texts_to_sequences(x_train_df_10)
+    padded_sequences = pad_sequences(sequences, maxlen=150, padding='post', truncating='post',value=0.0)
+    # print('返回数字化样本')
+    # left_word = [x[:-1] for x in padded_sequences]
+    # right_word = [x[1:] for x in padded_sequences]
+    # left_pad = pad_sequences(left_word, maxlen=100, value=0.0)
+    # right_pad = pad_sequences(right_word, maxlen=100, padding='post', truncating='post', value=0.0)
+    return padded_sequences, id
+
+def recall(y_true, y_pred):
+    '''
+    计算召回率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        召回率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    if c3 == 0:
+        return 0
+    recall = c1 / c3
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    计算F1
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        F1值
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    precision = c1 / c2
+    if c3 == 0:
+        recall = 0
+    else:
+        recall = c1 / c3
+    f1_score = 2 * (precision * recall) / (precision + recall)
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    计算精确率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        精确率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / c2
+    return precision
+
+if __name__ == '__main__':
+    remove_word = get_remove_word()  # 加载停用词、不重要的词
+    word_index, tokenizer, embedding_matrix = get_embedding()  # 加载文件,返回词典、keras tokennizer对象,词向量矩阵
+    label_mapping, labels = get_label()  # 加载标签字典,返回字典label_mapping {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备'  ; labels10 所有类别的中文名称
+    dic_label = get_dic()  # 加载分类 大类中类
+
+    file = '/data/python/lsm/test_11_relabel_0304.csv'  # 20200304重新标注的数据
+    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv' # 耔录原来标注数据
+    df = pd.read_csv(file)
+    text = df.loc[843]["file"]
+    text = clean_word([text], remove_word)
+    # text = cleanSeg(text=text)
+    print(text)
+    print()

+ 116 - 0
BiddingKG/dl/industry/main.py

@@ -0,0 +1,116 @@
+# encoding=utf-8
+import os
+#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # 指定使用CPU运行
+import pickle
+import pandas as pd
+import tensorflow as tf
+from text_classifier_pai.data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
+# from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
+import keras.backend as K
+from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Concatenate,Lambda,LSTM
+from keras.models import Model
+# from keras import models, metrics
+from keras.callbacks import ModelCheckpoint
+from keras.engine.topology import Layer
+from keras.optimizers import Adam,SGD
+
class Attention(Layer):
    """Additive self-attention pooling layer.

    Collapses a sequence of embeddings (BATCH_SIZE, MAX_TIMESTEPS,
    EMBED_SIZE) into a single vector (BATCH_SIZE, EMBED_SIZE) using
    learned attention weights. Respects an incoming mask but does not
    propagate it.
    """

    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Learned parameters:
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS) — per-timestep scores
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            # zero out attention on masked (padding) timesteps
            at *= K.cast(mask, K.floatx())
        # normalize into a probability distribution over timesteps
        # (epsilon guards against an all-masked row)
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        atx = K.expand_dims(at, axis=-1)
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE) — weighted inputs
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
+
class Text_Classifier():
    """Industry text classifier: BiGRU + attention + softmax over 203 classes.

    Loads vocabulary, embedding matrix, label mappings and pretrained
    weights at construction time; use process() -> predict() ->
    get_results() as the inference pipeline.
    """

    def __init__(self):
        self.remove_word = get_remove_word()  # load stop words and unimportant words
        self.word_index, self.tokenizer, self.embedding_matrix = get_embedding()  # load files; returns vocabulary, keras tokenizer object and embedding matrix
        self.label_mapping, self.labels = get_label()  # load label dict, e.g. label_mapping {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备'}; labels are all class names
        self.dic_label = get_dic()  # load the category hierarchy (top class / sub class)
        # self.checkpoint = '/home/python/lishimin/linuxPro/text_classifier_project/model/New_attentionGUR_embed100_relabel0311.h5'
        # keep the TF graph so predict() can run from worker threads
        self.graph = tf.get_default_graph()
        self.model = self.bigru_attention_softmax(150, self.word_index, self.embedding_matrix, classes=203)
        # self.model.load_weights(self.checkpoint)
        self.model.load_weights(os.path.dirname(__file__)+'/pickle_1/AttentionGRUacc0.9_class203.model')
    def bigru_attention_softmax(self,input_size, word_index, embedding_matrix, classes):
        # Build the BiGRU -> attention -> softmax classification model.
        sent_inputs = Input(shape=(input_size,), dtype="float32")
        sent_emb = Embedding(input_dim=len(word_index) + 1,
                             output_dim=100,
                             mask_zero=True,
                             weights=[embedding_matrix])(sent_inputs)
        sent_enc = Bidirectional(GRU(512,dropout=0.5, recurrent_dropout=0.5,
                                     return_sequences=True))(sent_emb)
        embeddings = Dropout(0.5)(sent_enc)
        sent_att1 = Attention()(embeddings)
        fc2_dropout = Dropout(0.5)(sent_att1)
        # fc1 = Dense(1024, activation="relu")(fc1_dropout)
        # fc2_dropout = Dropout(0.5)(fc1)
        sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
        model = Model(inputs=sent_inputs, outputs=sent_pred)
        # model.summary()
        return model

    def process(self,text_list):
        # Tokenize/clean raw texts into model features plus their row ids.
        ContentIDs = [[i, text] for i, text in enumerate(text_list)]
        features, ids = clean_word_with_tokenizer(ContentIDs, self.remove_word, self.tokenizer)
        return features, ids

    def predict(self, features, ids):
        # Run the model inside the stored graph (thread-safe for TF1).
        with self.graph.as_default():
            logits = self.model.predict(features)
        return logits, ids

    def get_results(self, logits, ids):
        # Map raw logits back to label names, keyed by the input ids.
        return model_in(logits, self.label_mapping, ids)
+
if __name__ == '__main__':
    # Ad-hoc smoke test: classify one row of the annotated CSV.
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data re-annotated on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv' # originally annotated data
    df = pd.read_csv(file)
    text_list = list(df['file'])
    classifier = Text_Classifier()
    features, ids = classifier.process([text_list[843]])
    logits, ids = classifier.predict(features, ids)
    results = classifier.get_results(logits, ids)
    print(results)
+

+ 272 - 0
BiddingKG/dl/industry/run_industry_server.py

@@ -0,0 +1,272 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  1 18:03:03 2018
+
+@author: DONG
+"""
+import sys
+import os
+from flask import Flask, jsonify
+from flask import abort
+from flask import request
+
+
# Make the parent package importable and force the Keras backend before
# any keras import happens.
sys.path.append(os.path.dirname(__file__)+"/..")
os.environ["KERAS_BACKEND"] = "tensorflow"

app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False  # allow non-ASCII (Chinese) JSON responses

import logging
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')


from text_classifier_pai.main import Text_Classifier
import numpy as np
import ctypes
import inspect
from threading import Thread
import traceback
import json
import time
import uuid
import re
from bs4 import BeautifulSoup

# Run on CPU only (model inference is served without a GPU).
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
sys.path.append(os.path.abspath("."))

# Single module-level classifier instance shared by all request threads.
classifier = Text_Classifier()
+
# custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder handling numpy arrays, numpy scalars and bytes.

    BUG FIX: `np.float_` was removed in numpy 2.0; use the `np.floating`
    abstract base instead (covers float16/32/64). Also serializes numpy
    integers, which previously raised TypeError.
    """
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.integer):
            return int(obj)
        return json.JSONEncoder.default(self, obj)
+
+def _async_raise(tid, exctype):
+    """raises the exception, performs cleanup if needed"""
+    tid = ctypes.c_long(tid)
+    if not inspect.isclass(exctype):
+        exctype = type(exctype)
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
+    if res == 0:
+        raise ValueError("invalid thread id")
+    elif res != 1:
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
def stop_thread(thread):
    # Forcefully terminate `thread` by injecting SystemExit into it.
    _async_raise(thread.ident, SystemExit)
+
+
def article_limit(soup,limit_words=30000):
    """Limit an HTML soup to at most `limit_words` non-whitespace characters.

    The main body and the attachment section (a div carrying the
    "richTextFetch" class) are each truncated to `limit_words` characters
    independently. The soup is modified in place and also returned.
    """
    sub_space = re.compile("\s+")
    def soup_limit(_soup,_count,max_count=30000,max_gap=500):
        """
        :param _soup: soup node to truncate
        :param _count: number of characters counted so far
        :param max_count: maximum allowed number of characters
        :param max_gap: tolerated overshoot once the limit is exceeded
        :return: (updated count, overshoot, next node to descend into or None)
        """
        _gap = _count - max_count
        _is_skip = False
        next_soup = None
        # unwrap chains of single-child wrappers carrying identical text
        while len(_soup.find_all(recursive=False)) == 1 and \
                _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
            _soup = _soup.find_all(recursive=False)[0]
        if len(_soup.find_all(recursive=False)) == 0:
            # leaf node: cut its text to the remaining character budget
            _soup.string = str(_soup.get_text())[:max_count-_count]
            _count += len(re.sub(sub_space, "", _soup.string))
            _gap = _count - max_count
            next_soup = None
        else:
            for _soup_part in _soup.find_all(recursive=False):
                if not _is_skip:
                    _count += len(re.sub(sub_space, "", _soup_part.get_text()))
                    if _count >= max_count:
                        _gap = _count - max_count
                        if _gap <= max_gap:
                            # small overshoot: accept this child, drop the rest
                            _is_skip = True
                        else:
                            # large overshoot: caller will recurse into this child
                            _is_skip = True
                            next_soup = _soup_part
                            _count -= len(re.sub(sub_space, "", _soup_part.get_text()))
                            continue
                else:
                    _soup_part.decompose()
        return _count,_gap,next_soup

    text_count = 0
    have_attachment = False
    attachment_part = None
    for child in soup.find_all(recursive=True):
        if child.name == 'div' and 'class' in child.attrs:
            if "richTextFetch" in child['class']:
                # mark the boundary between the main text and the attachment
                child.insert_before("##attachment##")
                attachment_part = child
                have_attachment = True
                break
    if not have_attachment:
        # no attachment: limit the whole document
        if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
            while n_soup:
                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)

    else:
        # attachment present: limit main text and attachment separately
        _text = re.sub(sub_space, "", soup.get_text())
        _text_split = _text.split("##attachment##")
        if len(_text_split[0])>limit_words:
            main_soup = attachment_part.parent
            main_text = main_soup.find_all(recursive=False)[0]
            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
            while n_soup:
                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
        if len(_text_split[1])>limit_words:
            # attachment html is plain text with no child structure
            if len(attachment_part.find_all(recursive=False))==0:
                attachment_part.string = str(attachment_part.get_text())[:limit_words]
            else:
                attachment_text_nums = 0
                attachment_skip = False
                for part in attachment_part.find_all(recursive=False):
                    if not attachment_skip:
                        last_attachment_text_nums = attachment_text_nums
                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
                        if attachment_text_nums>=limit_words:
                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
                            attachment_skip = True
                    else:
                        part.decompose()

    return soup
+
def run_thread(data,list_result):
    """Worker body: classify `data["content"]` and append the result dict
    to `list_result`.

    Runs inside a Thread so the HTTP handler can enforce a timeout; results
    are passed back through the shared list instead of a return value.
    """
    # data = data.decode("utf8")
    # data = json.loads(data,encoding="utf8")
    k = str(uuid.uuid4())
    cost_time = dict()
    if "doc_id" in data:
        _doc_id = data['doc_id']
    else:
        _doc_id = ""
    if "title" in data:
        _title = data["title"]
    else:
        _title = ""
    data_res = ""
    try:
        if "content" in data:
            # logging.info("get request of doc_id:%s"%(_doc_id))
            k = str(uuid.uuid4())
            cost_time = dict()
            content = data['content']

            # very long documents are truncated to 50000 chars before processing
            if len(content)>50000:
                _soup = BeautifulSoup(content,"lxml")
                _soup = article_limit(_soup,50000)
                content = str(_soup)

            start_time = time.time()
            # print('about to preprocess, content head:', content[:20])
            process, ids = classifier.process([content])
            # logging.info("get preprocessed done of doc_id%s"%(_doc_id))
            # print('preprocess done')
            cost_time["preprocess"] = time.time()-start_time
            # cost_time.update(_cost_time)

            start_time = time.time()
            # print('start predicting')
            # with self.classifier.sess.graph.as_default():
            logits, ids = classifier.predict(process, ids)
            # print('predict done')
            # logging.info("get predict done of doc_id%s"%(_doc_id))
            cost_time["predict"] = time.time()-start_time

            start_time = time.time()
            # print('about to extract results')
            result = classifier.get_results(logits, ids)
            class_name = result[0][1]  # predicted class name
            subclass, topclass = classifier.dic_label[class_name].split(',') # look up top/sub class names by class name
            # print('class returned successfully')
            # logging.info("get result done of doc_id%s"%(_doc_id))
            cost_time["result"] = time.time()-start_time

            data_res = {"class":topclass, "class_name":class_name, "subclass":subclass}
            data_res["success"] = True
            data_res["cost_time"] = cost_time

            #print(prem)
            # data_res = {'predict':result[0][1]}
            # data_res["cost_time"] = cost_time
            # data_res["success"] = True
            #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
        else:
            data_res = {"success":False,"msg":"content not passed"}


    except Exception as e:
        traceback.print_exc()
        data_res = {"success":False,"msg":str(e)}
        logging.error('Exception:%s'%str(e))
    # hand the result back to the caller through the shared list
    #_resp = json.dumps(data_res,cls=MyEncoder)
    #print(str(data["flag"])+str(data))
    logging.info("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
    list_result.append(data_res)
+
+
+
@app.route('/industry_extract', methods=['POST'])
def text_predict():
    """Flask endpoint: classify the posted document, optionally bounded by
    a timeout.

    Expects a JSON body with "content" (optionally "doc_id", "title" and
    "timeout" in seconds). Returns the classification JSON with HTTP 201,
    or an error payload with HTTP 500.
    """
    try:
        data = request.json

        status_code = 200
        # BUG FIX: `_timeout` was left undefined when the request carried no
        # "timeout" field, raising NameError at t.join(). Default to None,
        # which makes Thread.join wait indefinitely.
        _timeout = None
        if "timeout" in data:
            _timeout = data["timeout"]
        list_result = []
        t = Thread(target=run_thread, args=(data, list_result))
        start_time = time.time()
        t.start()
        t.join(_timeout)
        if t.is_alive():
            # still running after the deadline: kill the worker
            stop_thread(t)
            status_code = 302  # killed due to timeout
            data_res = {"success": False, "msg": "timeout"}
        else:
            data_res = list_result[0]
        _resp = json.dumps(data_res, cls=MyEncoder, ensure_ascii=False)

        return _resp, 201
    except Exception as e:
        traceback.print_exc()
        data_res = {"success": False, "msg": "error:%s" % (str(e))}
        _resp = json.dumps(data_res)
    return _resp, 500
+
def getPort(argv):
    """Extract a "port=NNNN" argument from argv.

    Returns the integer after the first "port=" occurrence found, or the
    default 15000 when no argument matches.
    """
    for arg in argv:
        pieces = str(arg).split("port=")
        if len(pieces) > 1:
            return int(pieces[-1])
    return 15000
+
if __name__ == '__main__':
    # Port is taken from a "port=NNNN" command-line argument (default 15000).
    port = getPort(argv=sys.argv)
    # Threaded Flask dev server; this call blocks until shutdown.
    app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
    # NOTE(review): the bare string below is a no-op expression (likely a
    # leftover print) and is only reached after app.run() returns.
    ("ContentExtractor running")
    # app.run()

+ 389 - 168
BiddingKG/dl/interface/Preprocessing.py

@@ -8,7 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.table_head.predict import predict
+# from BiddingKG.dl.table_head.predict import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -384,34 +384,38 @@ def tableToText(soup):
         set_item = set()
         height = len(inner_table)
         width = len(inner_table[0])
+        empty_set = set()
         for i in range(height):
             for j in range(width):
                 item = inner_table[i][j][0]
-                set_item.add(item)
+                if item.strip()=="":
+                    empty_set.add(item)
+                else:
+                    set_item.add(item)
         list_item = list(set_item)
-        x = []
-        for item in list_item:
-            x.append(getPredictor("form").encode(item))
-        predict_y = getPredictor("form").predict(np.array(x),type="item")
-        _dict = dict()
-        
-        for item,values in zip(list_item,list(predict_y)):
-            _dict[item] = values[1]
-            # print("##",item,values)
-        #print(_dict)
-        for i in range(height):
-            for j in range(width):
-                item = inner_table[i][j][0]
-                inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
+        if list_item:
+            x = []
+            for item in list_item:
+                x.append(getPredictor("form").encode(item))
+            predict_y = getPredictor("form").predict(np.array(x),type="item")
+            _dict = dict()
+
+            for item,values in zip(list_item,list(predict_y)):
+                _dict[item] = values[1]
+                # print("##",item,values)
+            #print(_dict)
+            for i in range(height):
+                for j in range(width):
+                    item = inner_table[i][j][0]
+                    if item not in empty_set:
+                        inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
 
         # print("=====")
         # for item in inner_table:
         #     print(item)
         # print("======")
-
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
-
         
         return inner_table,head_list
 
@@ -985,15 +989,30 @@ def tableToText(soup):
                 if inner_table[h][w][0]==fix_value:
                     inner_table[h][w][0] = ""
     
-    def trunTable(tbody):
+    def trunTable(tbody,in_attachment):
+        # print(tbody.find('tbody'))
+        # 附件中的表格,排除异常错乱的表格
+        if in_attachment:
+            if tbody.name=='table':
+                _tbody = tbody.find('tbody')
+                if _tbody is None:
+                    _tbody = tbody
+            else:
+                _tbody = tbody
+            _td_len_list = []
+            for _tr in _tbody.find_all(recursive=False):
+                len_td = len(_tr.find_all(recursive=False))
+                _td_len_list.append(len_td)
+            if len(list(set(_td_len_list)))>8:
+                return None
         fixSpan(tbody)
         inner_table = getTable(tbody)
         inner_table = fixTable(inner_table)
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            # inner_table, head_list = setHead_initem(inner_table,pat_head)
-            inner_table, head_list = set_head_model(inner_table)
+            inner_table, head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):
@@ -1033,20 +1052,36 @@ def tableToText(soup):
                 ul.get_text(), re.S)))>3:
             ul.extract()
 
-    tbodies = soup.find_all('table')
+    # tbodies = soup.find_all('table')
     # 遍历表格中的每个tbody
+    tbodies = []
+    in_attachment = False
+    for _part in soup.find_all():
+        if _part.name=='table':
+            tbodies.append((_part,in_attachment))
+        elif _part.name=='div':
+            if 'class' in _part.attrs and "richTextFetch" in _part['class']:
+                in_attachment = True
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
-        tbody = tbodies[len(tbodies)-tbody_index]
-        inner_table = trunTable(tbody)
+        tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
 
-    tbodies = soup.find_all('tbody')
+    # tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
+    tbodies = []
+    in_attachment = False
+    for _part in soup.find_all():
+        if _part.name == 'tbody':
+            tbodies.append((_part, in_attachment))
+        elif _part.name == 'div':
+            if 'class' in _part.attrs and "richTextFetch" in _part['class']:
+                in_attachment = True
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
-        tbody = tbodies[len(tbodies)-tbody_index]
-        inner_table = trunTable(tbody)
+        tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
 
     return soup
@@ -1098,7 +1133,7 @@ def get_preprocessed_outline(soup):
             deal_part = body_child[0]
     if len(deal_part.find_all(recursive=False))>2:
         deal_part = deal_part.parent
-    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table','<thead>','<tfoot>']
+    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table','thead','tfoot']
     for part in deal_part.find_all(recursive=False):
         # 查找解析文本的主干部分
         is_main_text = False
@@ -1196,7 +1231,7 @@ def segment(soup,final=True):
             # text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
-    commaList = ["div","br","td","p"]
+    commaList = ["div","br","td","p","li"]
     #commaList = []
     spaceList = ["span"]
 
@@ -1205,15 +1240,15 @@ def segment(soup,final=True):
         tbodies = soup.find_all('table')
     # 递归遍历所有节点,插入符号
     for child in soup.find_all(recursive=True):
-
+        # print(child.name,child.get_text())
         if child.name in segList:
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
-        if child.name == 'div' and 'class' in child.attrs:
-            # 添加附件"attachment"标识
-            if "richTextFetch" in child['class']:
-                child.insert_before("##attachment##")
+        # if child.name == 'div' and 'class' in child.attrs:
+        #     # 添加附件"attachment"标识
+        #     if "richTextFetch" in child['class']:
+        #         child.insert_before("##attachment##")
                 # print(child.parent)
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
@@ -1221,7 +1256,6 @@ def segment(soup,final=True):
         # if child.name in spaceList:
         #     child.insert_after(" ")
     text = str(soup.get_text())
-
     #替换英文冒号为中文冒号
     text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
     #替换为中文逗号
@@ -1273,7 +1307,10 @@ def segment(soup,final=True):
         if len(punc_del)>1:
             if len(punc_del.strip())>0:
                 if ":" in punc_del.strip():
-                    text = re.sub(punc_del,":",text)
+                    if "。" in punc_del.strip():
+                        text = re.sub(punc_del, ":。", text)
+                    else:
+                        text = re.sub(punc_del,":",text)
                 else:
                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
@@ -1620,107 +1657,213 @@ def get_preprocessed(articles, useselffool=False):
     return list_articles,list_sentences,list_entitys,list_outlines,cost_time
 
 def special_treatment(sourceContent, web_source_no):
-    if web_source_no == 'DX000202-1':
-         ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
-         if ser:
-             new = ""
-             l = ser.group(1).split(',')
-             for i in range(len(l)):
-                 it = l[i]
-                 if '-' in it:
-                     role, money = it.split('-')
-                     new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
-             sourceContent = sourceContent.replace(ser.group(0), new, 1)
-    elif web_source_no == '00753-14':
-        pcontent = sourceContent.find("div", id="pcontent")
-        pcontent = pcontent.find_all(recursive=False)[0]
-        first_table = None
-        for idx in range(len(pcontent.find_all(recursive=False))):
-            t_part = pcontent.find_all(recursive=False)[idx]
-            if t_part.name != "table":
-                break
-            if idx == 0:
-                first_table = t_part
-            else:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    first_table.find("tbody").append(_tr)
-                t_part.clear()
-    elif web_source_no == 'DX008357-11':
-        pcontent = sourceContent.find("div", id="pcontent")
-        pcontent = pcontent.find_all(recursive=False)[0]
-        error_table = []
-        is_error_table = False
-        for part in pcontent.find_all(recursive=False):
-            if is_error_table:
-                if part.name == "table":
-                    error_table.append(part)
-                else:
+    try:
+        if web_source_no == 'DX000202-1':
+             ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
+             if ser:
+                 new = ""
+                 l = ser.group(1).split(',')
+                 for i in range(len(l)):
+                     it = l[i]
+                     if '-' in it:
+                         role, money = it.split('-')
+                         new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
+                 sourceContent = sourceContent.replace(ser.group(0), new, 1)
+        elif web_source_no == '00753-14':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            pcontent = pcontent.find_all(recursive=False)[0]
+            first_table = None
+            for idx in range(len(pcontent.find_all(recursive=False))):
+                t_part = pcontent.find_all(recursive=False)[idx]
+                if t_part.name != "table":
                     break
-            if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
-                is_error_table = True
-        first_table = None
-        for idx in range(len(error_table)):
-            t_part = error_table[idx]
-            # if t_part.name != "table":
-            #     break
-            if idx == 0:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    if _tr.get_text(strip=True) == "":
-                        _tr.decompose()
-                first_table = t_part
-            else:
-                for _tr in t_part.find("tbody").find_all(recursive=False):
-                    if _tr.get_text(strip=True) != "":
+                if idx == 0:
+                    first_table = t_part
+                else:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
                         first_table.find("tbody").append(_tr)
-                t_part.clear()
-    elif web_source_no == '18021-2':
-        pcontent = sourceContent.find("div", id="pcontent")
-        td = pcontent.find_all("td")
-        for _td in td:
-            if str(_td.string).strip() == "报价金额":
-                _td.string = "单价"
-    elif web_source_no == '13740-2':
-        # “xxx成为成交供应商”
-        re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
-        if re_match:
-            sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group(), sourceContent)
-    elif web_source_no == '03786-10':
-        ser1 = re.search('中标价:([\d,.]+)', sourceContent)
-        ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
-        if ser1 and ser2:
-            m1 = ser1.group(1).replace(',', '')
-            m2 = ser2.group(1).replace(',', '')
-            if float(m1) < 100000 and (m1.split('.')[0] == m2.split('.')[0] or m2 == '0'):
-                new = '中标价(万元):' + m1
-                sourceContent = sourceContent.replace(ser1.group(0), new, 1)
-    elif web_source_no=='00076-4':
-        ser = re.search('主要标的数量:([0-9一]+)\w{,3},主要标的单价:([\d,.]+)元?,合同金额:(.00),', sourceContent)
-        if ser:
-            num = ser.group(1).replace('一', '1')
-            try:
-                num = 1 if num == '0' else num
-                unit_price = ser.group(2).replace(',', '')
-                total_price = str(int(num) * float(unit_price))
-                new = '合同金额:' + total_price
-                sourceContent = sourceContent.replace('合同金额:.00', new, 1)
-            except Exception as e:
-                log('preprocessing.py special_treatment exception')
-    elif web_source_no=='DX000105-2':
-        if re.search("成交公示", sourceContent) and re.search(',投标人:', sourceContent) and re.search(',成交人:', sourceContent)==None:
-            sourceContent = sourceContent.replace(',投标人:', ',成交人:')
-    elif web_source_no in ['04080-3', '04080-4']:
-        ser = re.search('合同金额:([0-9,]+.[0-9]{3,})(.{,4})', sourceContent)
-        if ser and '万' not in ser.group(2):
-            sourceContent = sourceContent.replace('合同金额:', '合同金额(万元):')
-    elif web_source_no=='03761-3':
-        ser = re.search('中标价,([0-9]+)[.0-9]*%', sourceContent)
-        if ser and int(ser.group(1))>100:
-            sourceContent = sourceContent.replace(ser.group(0), ser.group(0)[:-1]+'元')
-    elif web_source_no=='00695-7':
-        ser = re.search('支付金额:', sourceContent)
-        if ser:
-            sourceContent = sourceContent.replace('支付金额:', '合同金额:')
-    return sourceContent
+                    t_part.clear()
+        elif web_source_no == 'DX008357-11':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            pcontent = pcontent.find_all(recursive=False)[0]
+            error_table = []
+            is_error_table = False
+            for part in pcontent.find_all(recursive=False):
+                if is_error_table:
+                    if part.name == "table":
+                        error_table.append(part)
+                    else:
+                        break
+                if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
+                    is_error_table = True
+            first_table = None
+            for idx in range(len(error_table)):
+                t_part = error_table[idx]
+                # if t_part.name != "table":
+                #     break
+                if idx == 0:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
+                        if _tr.get_text(strip=True) == "":
+                            _tr.decompose()
+                    first_table = t_part
+                else:
+                    for _tr in t_part.find("tbody").find_all(recursive=False):
+                        if _tr.get_text(strip=True) != "":
+                            first_table.find("tbody").append(_tr)
+                    t_part.clear()
+        elif web_source_no == '18021-2':
+            body = sourceContent.find("body")
+            body_child = body.find_all(recursive=False)
+            pcontent = body
+            if 'id' in body_child[0].attrs:
+                if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                    pcontent = body_child[0]
+            # pcontent = sourceContent.find("div", id="pcontent")
+            td = pcontent.find_all("td")
+            for _td in td:
+                if str(_td.string).strip() == "报价金额":
+                    _td.string = "单价"
+        elif web_source_no == '13740-2':
+            # “xxx成为成交供应商”
+            re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
+            if re_match:
+                sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group())
+        elif web_source_no == '03786-10':
+            ser1 = re.search('中标价:([\d,.]+)', sourceContent)
+            ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
+            if ser1 and ser2:
+                m1 = ser1.group(1).replace(',', '')
+                m2 = ser2.group(1).replace(',', '')
+                if float(m1) < 100000 and (m1.split('.')[0] == m2.split('.')[0] or m2 == '0'):
+                    new = '中标价(万元):' + m1
+                    sourceContent = sourceContent.replace(ser1.group(0), new, 1)
+        elif web_source_no=='00076-4':
+            ser = re.search('主要标的数量:([0-9一]+)\w{,3},主要标的单价:([\d,.]+)元?,合同金额:(.00),', sourceContent)
+            if ser:
+                num = ser.group(1).replace('一', '1')
+                try:
+                    num = 1 if num == '0' else num
+                    unit_price = ser.group(2).replace(',', '')
+                    total_price = str(int(num) * float(unit_price))
+                    new = '合同金额:' + total_price
+                    sourceContent = sourceContent.replace('合同金额:.00', new, 1)
+                except Exception as e:
+                    log('preprocessing.py special_treatment exception')
+        elif web_source_no=='DX000105-2':
+            if re.search("成交公示", sourceContent) and re.search(',投标人:', sourceContent) and re.search(',成交人:', sourceContent)==None:
+                sourceContent = sourceContent.replace(',投标人:', ',成交人:')
+        elif web_source_no in ['04080-3', '04080-4']:
+            ser = re.search('合同金额:([0-9,]+.[0-9]{3,})(.{,4})', sourceContent)
+            if ser and '万' not in ser.group(2):
+                sourceContent = sourceContent.replace('合同金额:', '合同金额(万元):')
+        elif web_source_no=='03761-3':
+            ser = re.search('中标价,([0-9]+)[.0-9]*%', sourceContent)
+            if ser and int(ser.group(1))>100:
+                sourceContent = sourceContent.replace(ser.group(0), ser.group(0)[:-1]+'元')
+        elif web_source_no=='00695-7':
+            ser = re.search('支付金额:', sourceContent)
+            if ser:
+                sourceContent = sourceContent.replace('支付金额:', '合同金额:')
+        return sourceContent
+    except Exception as e:
+        log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
+        return sourceContent
+
+def article_limit(soup,limit_words=30000):
+    sub_space = re.compile("\s+")
+    def soup_limit(_soup,_count,max_count=30000,max_gap=500):
+        """
+        :param _soup: soup
+        :param _count: 当前字数
+        :param max_count: 字数最大限制
+        :param max_gap: 超过限制后的最大误差
+        :return:
+        """
+        _gap = _count - max_count
+        _is_skip = False
+        next_soup = None
+        while len(_soup.find_all(recursive=False)) == 1 and \
+                _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
+            _soup = _soup.find_all(recursive=False)[0]
+        if len(_soup.find_all(recursive=False)) == 0:
+            _soup.string = str(_soup.get_text())[:max_count-_count]
+            _count += len(re.sub(sub_space, "", _soup.string))
+            _gap = _count - max_count
+            next_soup = None
+        else:
+            for _soup_part in _soup.find_all(recursive=False):
+                if not _is_skip:
+                    _count += len(re.sub(sub_space, "", _soup_part.get_text()))
+                    if _count >= max_count:
+                        _gap = _count - max_count
+                        if _gap <= max_gap:
+                            _is_skip = True
+                        else:
+                            _is_skip = True
+                            next_soup = _soup_part
+                            _count -= len(re.sub(sub_space, "", _soup_part.get_text()))
+                            continue
+                else:
+                    _soup_part.decompose()
+        return _count,_gap,next_soup
+
+    text_count = 0
+    have_attachment = False
+    attachment_part = None
+    for child in soup.find_all(recursive=True):
+        if child.name == 'div' and 'class' in child.attrs:
+            if "richTextFetch" in child['class']:
+                child.insert_before("##attachment##")
+                attachment_part = child
+                have_attachment = True
+                break
+    if not have_attachment:
+        # 无附件
+        if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
+            while n_soup:
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+
+    else:
+        # 有附件
+        _text = re.sub(sub_space, "", soup.get_text())
+        _text_split = _text.split("##attachment##")
+        if len(_text_split[0])>limit_words:
+            main_soup = attachment_part.parent
+            main_text = main_soup.find_all(recursive=False)[0]
+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
+            while n_soup:
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+        if len(_text_split[1])>limit_words:
+            # attachment_html纯文本,无子结构
+            if len(attachment_part.find_all(recursive=False))==0:
+                attachment_part.string = str(attachment_part.get_text())[:limit_words]
+            else:
+                attachment_text_nums = 0
+                attachment_skip = False
+                for part in attachment_part.find_all(recursive=False):
+                    if not attachment_skip:
+                        last_attachment_text_nums = attachment_text_nums
+                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
+                        if attachment_text_nums>=limit_words:
+                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
+                            attachment_skip = True
+                    else:
+                        part.decompose()
+
+    return soup
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
@@ -1736,14 +1879,13 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
         sourceContent = sourceContent.replace('<br/>', '<br>')
         sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
-        for br_match in re.findall("[^>]+?<br>",sourceContent):
-            _new = re.sub("<br>","",br_match)
-            # <br>标签替换为<p>标签
-            if not re.search("^\s+$",_new):
-                _new = '<p>'+_new + '</p>'
-                # print(br_match,_new)
-                sourceContent = sourceContent.replace(br_match,_new,1)
-
+        # for br_match in re.findall("[^>]+?<br>",sourceContent):
+        #     _new = re.sub("<br>","",br_match)
+        #     # <br>标签替换为<p>标签
+        #     if not re.search("^\s+$",_new):
+        #         _new = '<p>'+_new + '</p>'
+        #         # print(br_match,_new)
+        #         sourceContent = sourceContent.replace(br_match,_new,1)
         _send_doc_id = article[3]
         _title = article[4]
         page_time = article[5]
@@ -1761,18 +1903,32 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         if web_source_no in ["00753-14","DX008357-11","18021-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
         for _soup in article_processed.descendants:
-            # 识别无标签文本,添加<p>标签
+            # 识别无标签文本,添加<span>标签
             if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
                 # print(_soup.parent.string,_soup.string.strip())
-                _soup.wrap(article_processed.new_tag("p"))
+                _soup.wrap(article_processed.new_tag("span"))
         # print(article_processed)
+        # 正文和附件内容限制字数30000
+        article_processed = article_limit(article_processed,limit_words=30000)
         article_processed = get_preprocessed_outline(article_processed)
+        # print('article_processed')
         article_processed = tableToText(article_processed)
-        # print(article_processed)
         article_processed = segment(article_processed)
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
+        # 修复OCR金额中“,”、“。”识别错误
+        article_processed_list = article_processed.split("##attachment##")
+        if len(article_processed_list)>1:
+            attachment_text = article_processed_list[1]
+            for _match in re.finditer("\d。\d{2}",attachment_text):
+                _match_text = _match.group()
+                attachment_text = attachment_text.replace(_match_text,_match_text.replace("。","."),1)
+            for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
+                _match_text = _match.group()
+                attachment_text = attachment_text.replace(_match_text,_match_text.replace(",",","),1)
+            article_processed_list[1] = attachment_text
+            article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
         if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
@@ -1994,7 +2150,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             cost_time[key_nerToken] = 0
         cost_time[key_nerToken] += round(time.time()-start_time,2)
 
-
+        company_dict = set()
+        company_index = dict((i,set()) for i in range(len(list_sentence)))
         for sentence_index in range(len(list_sentence)):
             list_sentence_entitys = []
 
@@ -2046,6 +2203,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'org', entity))
 
+            for ner_entity in ner_entitys:
+                if ner_entity[2] in ['company','org']:
+                    company_dict.add((ner_entity[2],ner_entity[3]))
+                    company_index[sentence_index].add((ner_entity[0],ner_entity[1]))
             #识别package
 
             #识别实体
@@ -2096,7 +2257,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
                                   "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
@@ -2192,17 +2353,19 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             if k.split("_")[0]=="money":
                                 entity_text = v
                             if k.split("_")[0]=="unit":
-                                unit = v
+                                if v=='万元' or unit=="":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
+                                    unit = v
                             if k.split("_")[0]=="text":
                                 text_beforeMoney = v
                             if k.split("_")[0]=="filter":
                                 filter = v
                             if re.search("filter_unit",k) is not None:
                                 filter_unit = True
-
+                    # print(_match.group())
+                    # print(entity_text,unit,text_beforeMoney,filter,filter_unit)
                     if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
                         if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0]-2):_match.span()[0]]):
-                            entity_text = re.sub('\d+,', '', entity_text)
+                             entity_text = re.sub('\d+,', '', entity_text)
                         else:
                             entity_text = entity_text.replace(',', '.')
                         # print(' 修正OCR识别小数点为逗号')
@@ -2294,7 +2457,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         else:
                             entity_text = str(getUnifyMoney(entity_text))
 
-                    if float(entity_text)>100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
+                    if float(entity_text)>100000000000 or float(entity_text)<100:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
                         # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue
 
@@ -2314,6 +2477,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
                             # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
+                            # print(entity_text,unit,notes)
+
                 else:
                     index += 1
 
@@ -2321,27 +2486,28 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
             error_text = ['交易','机构','教育','项目','公司','中标','开标','截标','监督','政府','国家','中国','技术','投标','传真','网址','电子邮',
                           '联系','联系电','联系地','采购代','邮政编','邮政','电话','手机','手机号','联系人','地址','地点','邮箱','邮编','联系方','招标','招标人','代理',
-                          '代理人','采购','附件','注意','登录','报名','踏勘']
+                          '代理人','采购','附件','注意','登录','报名','踏勘',"测试"]
             list_person_text = set(list_person_text + error_text)
             re_person = re.compile("联系人[::]([\u4e00-\u9fa5]工)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})(?=联系)|"
                                    "联系人[::]([\u4e00-\u9fa5]{2,3})")
             list_person = []
-            for match_result in re_person.finditer(sentence_text):
-                match_text = match_result.group()
-                entity_text = match_text[4:]
-                wordOffset_begin = match_result.start() + 4
-                wordOffset_end = match_result.end()
-                # print(text[wordOffset_begin:wordOffset_end])
-                # 排除一些不为人名的实体
-                if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
-                    continue
-                if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
-                    _person = dict()
-                    _person['body'] = entity_text
-                    _person['begin_index'] = wordOffset_begin
-                    _person['end_index'] = wordOffset_end
-                    list_person.append(_person)
+            if not in_attachment:
+                for match_result in re_person.finditer(sentence_text):
+                    match_text = match_result.group()
+                    entity_text = match_text[4:]
+                    wordOffset_begin = match_result.start() + 4
+                    wordOffset_end = match_result.end()
+                    # print(text[wordOffset_begin:wordOffset_end])
+                    # 排除一些不为人名的实体
+                    if re.search("^[\u4e00-\u9fa5]{7,}([,。]|$)",sentence_text[wordOffset_begin:wordOffset_begin+20]):
+                        continue
+                    if entity_text not in list_person_text and entity_text[:2] not in list_person_text:
+                        _person = dict()
+                        _person['body'] = entity_text
+                        _person['begin_index'] = wordOffset_begin
+                        _person['end_index'] = wordOffset_end
+                        list_person.append(_person)
             entity_type = "person"
             for person in list_person:
                 begin_index_temp = person['begin_index']
@@ -2480,6 +2646,61 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
+        # 补充ner模型未识别全的company/org实体
+        for sentence_index in range(len(list_sentence)):
+            sentence_text = list_sentence[sentence_index].sentence_text
+            tokens = list_sentence[sentence_index].tokens
+            doc_id = list_sentence[sentence_index].doc_id
+            in_attachment = list_sentence[sentence_index].in_attachment
+            list_tokenbegin = []
+            begin = 0
+            for i in range(0, len(tokens)):
+                list_tokenbegin.append(begin)
+                begin += len(str(tokens[i]))
+            list_tokenbegin.append(begin + 1)
+            add_sentence_entitys = []
+            company_dict = sorted(list(company_dict),key=lambda x:len(x[1]),reverse=True)
+            for company_type,company_text in company_dict:
+                begin_index_list = findAllIndex(company_text,sentence_text)
+                for begin_index in begin_index_list:
+                    is_continue = False
+                    for t_begin,t_end in list(company_index[sentence_index]):
+                        if begin_index>=t_begin and begin_index+len(company_text)<=t_end:
+                            is_continue = True
+                            break
+                    if not is_continue:
+                        add_sentence_entitys.append((begin_index,begin_index+len(company_text),company_type,company_text))
+                        company_index[sentence_index].add((begin_index,begin_index+len(company_text)))
+                    else:
+                        continue
+            for ner_entity in add_sentence_entitys:
+                begin_index_temp = ner_entity[0]
+                end_index_temp = ner_entity[1]
+                entity_type = ner_entity[2]
+                entity_text = ner_entity[3]
+
+                if entity_type in ["org","company"] and not isLegalEnterprise(entity_text):
+                    continue
+
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j]==begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j]>begin_index_temp:
+                        begin_index = j-1
+                        break
+                begin_index_temp += len(str(entity_text))
+                for j in range(begin_index,len(list_tokenbegin)):
+                    if list_tokenbegin[j]>=begin_index_temp:
+                        end_index = j-1
+                        break
+                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
+
+                #去掉标点符号
+                entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
+                entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
+                list_entitys_temp.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
+        list_entitys_temp.sort(key=lambda x:(x.sentence_index,x.begin_index))
         list_entitys.append(list_entitys_temp)
     return list_entitys
     

+ 119 - 23
BiddingKG/dl/interface/extract.py

@@ -42,7 +42,71 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
-def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
+def extractCount(extract_dict):
+    # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+
+    if len(extract_dict):
+        _extract = extract_dict
+    else:
+        _extract = {}
+    # print(_extract)
+    dict_pack = _extract.get("prem",{})
+    extract_count = 0
+    list_code = _extract.get("code",[])
+    if len(list_code)>0:
+        project_code = list_code[0]
+    else:
+        project_code = ""
+    project_name = _extract.get("name","")
+    bidding_budget = ""
+    win_tenderer = ""
+    win_bid_price = ""
+    for _key in dict_pack.keys():
+        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
+            extract_count += 1
+            if bidding_budget=="":
+                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
+        for _role in dict_pack[_key]["roleList"]:
+            if isinstance(_role,list):
+                extract_count += 1
+                if _role[2]!='' and float(_role[2])>0:
+                    extract_count += 1
+                if _role[0]=="tenderee":
+                    tenderee = _role[1]
+                if _role[0]=="win_tenderer":
+                    if  win_tenderer=="":
+                        win_tenderer = _role[1]
+                    if _role[2]!='' and float(_role[2])>0:
+                        extract_count += 1
+                        if win_bid_price=="":
+                            win_bid_price = str(float(_role[2]))
+                if _role[0]=="agency":
+                    agency = _role[1]
+            if isinstance(_role,dict):
+                extract_count += 1
+                if "role_money" in _role:
+                    if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
+                        extract_count += 1
+                if _role.get("role_name")=="tenderee":
+                    tenderee = _role["role_text"]
+                if _role.get("role_name")=="win_tenderer":
+                    if  win_tenderer=="":
+                        win_tenderer = _role["role_text"]
+                    if "role_money" in _role:
+                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                            extract_count += 1
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role["role_money"]["money"]))
+                if _role["role_name"]=="agency":
+                    agency = _role["role_text"]
+
+    if project_code!="":
+        extract_count += 1
+    if project_name!="":
+        extract_count += 1
+    return extract_count
+
+def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
@@ -52,10 +116,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
-    #依赖句子顺序
-    start_time = time.time() # 公告类型/生命周期提取
-    channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0], web_source_no=web_source_no)
-    cost_time["channel"] = round(time.time()-start_time,2)
+    # #依赖句子顺序
+    # start_time = time.time() # 公告类型/生命周期提取  此处作废 换到后面预测 2022/4/29
+    # channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0],
+    #                                                         web_source_no=web_source_no,original_docchannel=original_docchannel)
+    # cost_time["channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 项目编号、名称提取
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
@@ -67,12 +132,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
-    start_time = time.time() # 产品名称及废标原因提取
-    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
-    fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
-    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
-    log("get product done of doc_id%s"%(doc_id))
-    cost_time["product"] = round(time.time()-start_time,2)
+    # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
+    # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
+    # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
+    # # predictor.getPredictor("product").predict(list_sentences, list_entitys)
+    # log("get product done of doc_id%s"%(doc_id))
+    # cost_time["product"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
     product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
@@ -85,9 +150,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
     cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
 
+    start_time = time.time() #正则招标人召回
+    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
+    cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)
+
     start_time = time.time() #联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
@@ -109,7 +178,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
             for _entity in list_entity:
                 # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                 if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2:
-                    if channel_dic['docchannel'] == "招标公告":
+                    # if channel_dic['docchannel'] == "招标公告":
+                    if re.search('中标|成交|中选|中价|中租||结果|入围', title+list_articles[0].content[:100])==None:
                         _entity.values[0] = 0.51
                         _entity.set_Money(0, _entity.values)  #2021/11/18 根据公告类别把费用改为招标或中投标金额
                     else:
@@ -124,6 +194,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     # 依赖句子顺序
     start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
+    doctitle_refine = entityLink.doctitle_refine(title)
+    nlp_enterprise,nlp_enterprise_attachment = entityLink.get_nlp_enterprise(list_entitys[0])
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
@@ -132,32 +204,56 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
 
-    if len(product_attrs[1]['demand_info']['data'])>0:
-        for d in product_attrs[1]['demand_info']['data']:
-            for product in set(prem[0]['product']):
-                if product in d['project_name']:
-                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
     '''修正采购公告表格形式多种采购产品中标价格'''
     if total_product_money>0 and len(prem[0]['prem'])==1:
         for value in prem[0]['prem'].values():
             for l in value['roleList']:
                 try:
-                    if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
-                        l[2] = total_product_money
+                    # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
+                    #     l[2] = total_product_money
+                    #     log('修改中标金额为所有产品总金额')
+                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money:
+                        l["role_money"]['money'] = total_product_money
                         log('修改中标金额为所有产品总金额')
                 except Exception as e:
                     log('表格产品价格修正中标价格报错:%s'%e)
 
-    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别'''
+    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem
     start_time = time.time()
-    content = list_articles[0].content
-    channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
+    # content = list_articles[0].content
+    # channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
+    channel_dic = predictor.getPredictor("channel").predict_merge(title,list_sentences[0], text, list_articles[0].bidway, prem[0], original_docchannel)
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
+    start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
+    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
+    fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
+    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
+    log("get product done of doc_id%s"%(doc_id))
+    cost_time["product"] = round(time.time()-start_time,2)
+    prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
+
+    '''公告无表格格式时,采购意向预测'''  #依赖 docchannel结果 依赖产品及prem
+    if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
+        product_attrs = predictor.getPredictor("product_attrs").predict_without_table(product_attrs, list_sentences,
+                                                                                      list_entitys,codeName,prem,text,page_time)
+    if len(product_attrs[1]['demand_info']['data'])>0:
+        for d in product_attrs[1]['demand_info']['data']:
+            for product in set(prem[0]['product']):
+                if product in d['project_name'] and product not in d['product']:
+                    d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
+
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
+    data_res["doctitle_refine"] = doctitle_refine
+    data_res["nlp_enterprise"] = nlp_enterprise
+    data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
+    # 要素的个数
+    data_res['extract_count'] = extractCount(data_res)
+    # 是否有表格
+    data_res['exist_table'] = 1 if re.search("<td",text) else 0
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 

+ 144 - 31
BiddingKG/dl/interface/getAttributes.py

@@ -348,18 +348,25 @@ def get_legal_comba(list_entity,dict_role_combination):
 
 def get_dict_entity_prob(list_entity,on_value=0.5):
     dict_pack_entity_prob = {}
-    for entity in list_entity:
-        if entity.entity_type in ['org','company']:
-            values = entity.values
-            role_prob = float(values[int(entity.label)])
-            _key = entity.packageName+"$$"+str(entity.label)
-            if role_prob>=on_value and str(entity.label)!="5":
-                _key_prob = _key+"$text$"+entity.entity_text
-                if _key_prob in dict_pack_entity_prob:
-                    if role_prob>dict_pack_entity_prob[_key_prob][1]:
+    for in_attachment in [False,True]:
+        identified_role = []
+        if in_attachment==True:
+            identified_role = [value[0] for value in dict_pack_entity_prob.values()]
+        for entity in list_entity:
+            if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment:
+                values = entity.values
+                role_prob = float(values[int(entity.label)])
+                _key = entity.packageName+"$$"+str(entity.label)
+                if role_prob>=on_value and str(entity.label)!="5":
+                    _key_prob = _key+"$text$"+entity.entity_text
+                    if in_attachment == True:
+                        if entity.entity_text in identified_role:
+                            continue
+                    if _key_prob in dict_pack_entity_prob:
+                        if role_prob>dict_pack_entity_prob[_key_prob][1]:
+                            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+                    else:
                         dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
-                else:
-                    dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
     return dict_pack_entity_prob
 
 
@@ -437,13 +444,17 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
         return None
     PackageList,PackageSet,dict_PackageCode = pack
 
-
     #拿到所有可能的情况
     dict_role_combination = {}
   # print(PackageList)
     #拿到各个实体的packageName,packageCode
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
+            #限制附件里角色values[label]最大概率prob
+            max_prob = 0.85
+            if str(entity.label)!="5" and entity.in_attachment:
+                if entity.values[entity.label]>max_prob:
+                    entity.values[entity.label] = max_prob
             #过滤掉字数小于3个的实体
             if len(entity.entity_text)<=3:
                 continue
@@ -824,7 +835,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     packDict[packageName]["roleList"][i].money = money.entity_text
                     packDict[packageName]["roleList"][i].money_prob = money_prob
                     packDict[packageName]["roleList"][i].money_unit = money.money_unit
-                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
+                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or (money.notes in ['大写'] and money.in_attachment==False): # 2021/7/20改为优先选择大写金额,
                     # print('已连接金额概率:money_prob:',packDict[packageName]["roleList"][i].money_prob)
                     # print('链接金额备注 ',money.notes, money.entity_text, money.values)
                     packDict[packageName]["roleList"][i].money = money.entity_text
@@ -1098,7 +1109,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # km算法分配求解
         dispatch_result = dispatch(temp_match_list)
         dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
-        # print(dispatch_result)
         for match in dispatch_result:
             _entity = match[0]
             _attribute = match[1]
@@ -1106,6 +1116,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 _entity.pointer_money = _attribute
                 packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
                                                "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                # print(_entity.entity_text,_attribute.entity_text)
                 if packagePointer is None:
                     packageName_entity = "Project"
                 else:
@@ -1135,7 +1146,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 else:
                     packageName_entity = packagePointer.entity_text
                 addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
-
     ''''''
     # 通过模型分类的招标/代理联系人
     list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
@@ -1239,7 +1249,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
                        '[2-9]\d{6,7}')
-    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
     email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
                             "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
@@ -1357,11 +1367,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     else:
         # 公告大于maxlen时,分段预测
         start = 0
+        # print("len(pre_data)",len(pre_data))
+        temp_data = []
+        deal_data = 0
         while start<len(pre_data):
             _pre_data = pre_data[start:start+maxlen]
             _text_data = text_data[start:start+maxlen]
-            relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+            if relationExtraction_model.check_data(_pre_data):
+                temp_data.append((_text_data,_pre_data))
+            else:
+                if temp_data:
+                    deal_data += len(temp_data)
+                    if deal_data>3:
+                        break
+                    for _text_data, _pre_data in temp_data:
+                        relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+                    temp_data = []
             start = start + maxlen - 120
+        # print("预测数据:",len(temp_data))
+        # if len(temp_data)<=6:
+        #     for _text_data,_pre_data in temp_data:
+        #         relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+        # else:
+        #     relation_list = []
         # 去重结果
         relation_list = list(set(relation_list))
     # print(relation_list)
@@ -1377,6 +1405,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
+    linked_connetPerson = set()
+    linked_phone = set()
     for predicate in ["rel_address","rel_phone","rel_person"]:
         _match_list = []
         _match_combo = []
@@ -1444,6 +1474,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 break
                 if is_continue: continue
                 combo[0].person_phone.append(combo[1])
+                linked_connetPerson.add(combo[0])
+                linked_phone.add(combo[1])
                 if combo[0].label in [1,2]:
                     if PackDict.get("Project"):
                         for i in range(len(PackDict["Project"]["roleList"])):
@@ -1452,6 +1484,68 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
+    # "联系人——联系电话" 链接规则补充
+    person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
+    person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(person_phone_EntityList)):
+        entity = person_phone_EntityList[ent_idx]
+        if entity.entity_type=="person":
+            match_nums = 0
+            person_nums = 0  # 经过其他中联系人的数量
+            byNotPerson_match_nums = 0  # 跟在联系人后面的属性
+            phone_nums = 0 # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
+                after_entity = person_phone_EntityList[after_index]
+                if after_entity.entity_type == "phone":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    phone_nums += 1
+                    if distance>100 or phone_nums>=4:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                    else:
+                        if distance < 50:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                else:
+                    person_nums += 1
+            # 前向查找属性
+            if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
+                previous_entity = person_phone_EntityList[ent_idx - 1]
+                if previous_entity.entity_type == 'phone':
+                    # if previous_entity.sentence_index == entity.sentence_index:
+                    distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                            tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
+                    if distance < 40:
+                        # 前向 没有 /10000
+                        value = (-1 / 2 * (distance ** 2))
+                        t_match_list.append(Match(entity, previous_entity, value))
+    # km算法分配求解(person-phone)
+    t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
+    personphone_result = dispatch(t_match_list)
+    personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in personphone_result:
+        _person = match[0]
+        _phone = match[1]
+        if not _person.person_phone:
+            _person.person_phone = []
+        _person.person_phone.append(_phone)
     # 多个招标人/代理人或者别称
     for idx in range(1,len(pre_entity)):
         _pre_entity = pre_entity[idx]
@@ -1852,7 +1946,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         for k in PackDict.keys():
             for i in range(len(PackDict[k]["roleList"])):
                 if PackDict[k]["roleList"][i].role_name == "tenderee":
-                    if not PackDict[k]["roleList"][i].linklist:
+                    # if not PackDict[k]["roleList"][i].linklist:
                         if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
                             if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
                                 if not phone_:
@@ -1862,7 +1956,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                     PackDict[k]["roleList"][i].linklist.append((person_, p))
                                 is_update = True
                 elif PackDict[k]["roleList"][i].role_name == "agency":
-                    if not PackDict[k]["roleList"][i].linklist:
+                    # if not PackDict[k]["roleList"][i].linklist:
                         if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
                             if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
                                 if not phone_:
@@ -1895,7 +1989,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             for _person in company_entity.pointer_person:
                 linked_person.append(_person)
                 linked_persons_with.append(company_entity)
-
     # 一个公司对应多个联系人的补充
     person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
     person_entitys = person_entitys[::-1]
@@ -2358,7 +2451,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     # print('招标金额校正中标金额')
-
+    # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
+                if float(PackDict[pack]["roleList"][i].money) < 1000 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
     # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
     for pack in PackDict.keys():
         tmp_moneys = []
@@ -2382,7 +2482,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     if not get_contacts:
                         # 根据大纲Outline类召回联系人
                         for outline in list_outline:
-                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",outline.outline_summary):
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
                                 for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
                                     if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
                                         t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
@@ -2417,12 +2517,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
                             get_contacts = True
                     if not get_contacts:
-                        # 通过大纲直接取电话
+                        # 通过大纲Outline类直接取电话
                         if len(new_split_list) > 1:
                             for _start, _end in new_split_list:
                                 temp_sentence = _content[_start:_end]
                                 sentence_outline = temp_sentence.split(",::")[0]
-                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人", sentence_outline):
+                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
                                     sentence_phone = phone.findall(temp_sentence)
                                     if sentence_phone:
                                         PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
@@ -2736,6 +2836,7 @@ def getTimeAttributes(list_entity,list_sentence):
             result_dict[time_type] = list_time[0][0]
     return result_dict
 
+
 def getOtherAttributes(list_entity):
     dict_other = {"moneysource":"",
                   "person_review":[],
@@ -2750,14 +2851,19 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='moneysource':
             dict_other["moneysource"] = entity.entity_text
         elif entity.entity_type=='serviceTime':
-            dict_other["serviceTime"] = entity.entity_text
+            if re.search("日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
+                if not entity.in_attachment:
+                    dict_other["serviceTime"] = entity.entity_text
+                else:
+                    if not dict_other["serviceTime"]:
+                        dict_other["serviceTime"] = entity.entity_text
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
             dict_other["product"].append(entity.entity_text)
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
-                dict_other["total_tendereeMoney"] = float(entity.entity_text)
-                dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+            dict_other["total_tendereeMoney"] = float(entity.entity_text)
+            dict_other["total_tendereeMoneyUnit"] = entity.money_unit
 
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
@@ -2775,10 +2881,17 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
     result = []
     for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
         RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
-        result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
-                           **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
-                              "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
-                              "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
+        result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
+                           **getTimeAttributes(list_entity, list_sentence),
+                           **{"fingerprint": list_article.fingerprint,
+                              "match_enterprise": list_article.match_enterprise,
+                              "match_enterprise_type": list_article.match_enterprise_type,
+                              "process_time": getCurrent_date(),
+                              "attachmentTypes": list_article.attachmentTypes, "bidway": list_article.bidway}))
+        # result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
+        #                    **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
+        #                       "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
+        #                       "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
     return result
 
 

+ 12 - 4
BiddingKG/dl/interface/modelFactory.py

@@ -258,7 +258,8 @@ class Model_relation_extraction():
             last_sentence_index = key
         return text_data, pre_data
 
-    def predict(self,text_in, words, rate=0.5):
+    def check_data(self, words):
+        # 检查数据是否包含可预测的subject和object
         # 没有需要预测的链接属性,直接return
         company_relation = 0
         person_relation = 0
@@ -268,12 +269,19 @@ class Model_relation_extraction():
             person_relation += 1
             if company_relation:
                 company_relation += 1
-        if '<location>' in words and company_relation:
-            company_relation += 1
+        # 暂时不考虑地址location实体
+        # if '<location>' in words and company_relation:
+        #     company_relation += 1
         if '<phone>' in words and company_relation:
             person_relation += 1
         if company_relation < 2 and person_relation < 2:
-            return []
+            return False
+        return True
+
+    def predict(self,text_in, words, rate=0.5):
+        # 没有需要预测的链接属性,直接return
+        # if self.check_data(words):
+        #     return []
         # 使用模型预测
         triple_list = []
         # print("tokens:",words)

+ 750 - 100
BiddingKG/dl/interface/predictor.py

@@ -32,6 +32,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "epc":{"predictor":None,"Lock":RLock()},
               "roleRule":{"predictor":None,"Lock":RLock()},
               "roleRuleFinal":{"predictor":None,"Lock":RLock()},
+              "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
@@ -57,6 +58,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
                     dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
+                if _type == "tendereeRuleRecall":
+                    dict_predictor[_type]["predictor"] = TendereeRuleRecall()
                 if _type == "form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
                 if _type == "time":
@@ -332,9 +335,9 @@ class CodeNamePredict():
                         else:
                             begin = iter.span()[0]-get_len
                         end = iter.span()[1]+get_len
-                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
-                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
-                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
+                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
+                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""))
+                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         temp_entitys.append(_entity)
                     #print("code",code_text)
                     if len(code_x)>0:
@@ -363,7 +366,7 @@ class CodeNamePredict():
                             with self.sess_codesplit.as_default():
                                 with self.sess_codesplit.graph.as_default():
                                     inputs_code,outputs_code = self.getModel_code()
-                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]
+                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
 
                                     #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                     #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
@@ -1100,21 +1103,21 @@ class RoleRulePredictor():
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
         self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
-                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
-                                "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
-                                     "(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+                                     "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                         "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                         "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
@@ -1123,10 +1126,10 @@ class RoleRulePredictor():
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
         self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
         self.pattern_whole = [self.pattern_tenderee_left,
@@ -1266,7 +1269,7 @@ class RoleRulePredictor():
                                                 if _v_group is not None and _v_group != "":
                                                     _role = _group.split("_")[0]
                                                     if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
-                                                        print('p_entity_sentenceindex:', p_entity.sentence_index)
+                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                         if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
                                                             continue
                                                         if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
@@ -1278,9 +1281,8 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
-                                                                                                        list_spans[
-                                                                                                            0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                                                                        list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
@@ -1385,70 +1387,96 @@ class RoleRulePredictor():
 
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
-    def predict(self, list_articles, list_entitys, list_codenames):
-        text_end = list_articles[0].content[-40:]
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
+        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
+        end_tokens = []
+        for sentence in main_sentences[-5:]:
+            end_tokens.extend(sentence.tokens)
+        text_end = "".join(end_tokens[-30:])
+        # print(text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
-
-        if sear_ent or sear_ent2 or sear_ent3:
-            if sear_ent3:
-                ent_re = sear_ent3.group(2)
-            elif sear_ent2:
-                ent_re = sear_ent2.group(2)
-            else:
-                ent_re = sear_ent.group(1)
-            ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
-            tenderee_notfound = True
-            agency_notfound = True
-            ents = []
-            for ent in list_entitys[0]:
-                if ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
-                        tenderee_notfound = False
-                    elif ent.label == 1:
-                        agency_notfound = False
-                    elif ent.label == 5:
-                        ents.append(ent)
-            if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
-                                              or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
-                n = 0
-                for i in range(len(ents) - 1, -1, -1):
-                    n += 1
-                    if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
-                        break
-                    if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 0
-                        ents[i].values[0] = 0.5
-                        # log('正则最后补充实体: %s'%(ent_re))
-                        break
-            elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
-                n = 0
-                for i in range(len(ents) - 1, -1, -1):
-                    n += 1
-                    if n > 3 and sear_ent:  # 文章末尾角色加日期这种只找后三个实体
-                        break
-                    if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 1
-                        ents[i].values[1] = 0.5
-                        # log('正则最后补充实体: %s'%(ent_re))
-                        break
-
+        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
+
+        tenderee_notfound = True
+        agency_notfound = True
+        ents = []
+        for ent in list_entitys[0]:
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    tenderee_notfound = False
+                elif ent.label == 1:
+                    agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
+            for _sear_ent in [_sear for _sear in sear_list if _sear]:
+                # if sear_ent4:
+                #     ent_re = sear_ent4.group(2)
+                # elif sear_ent3:
+                #     ent_re = sear_ent3.group(2)
+                # elif sear_ent2:
+                #     ent_re = sear_ent2.group(2)
+                # else:
+                #     ent_re = sear_ent.group(1)
+                if _sear_ent==sear_ent4:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent3:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent2:
+                    ent_re = _sear_ent.group(2)
+                else:
+                    ent_re = _sear_ent.group(1)
+                # print('ent_re', ent_re)
+                ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
+
+                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
+                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
+                    n = 0
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 0
+                            ents[i].values[0] = 0.5
+                            tenderee_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
+                    n = 0
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent:  # 文章末尾角色加日期这种只找后三个实体
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 1
+                            ents[i].values[1] = 0.5
+                            agency_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                if not tenderee_notfound:
+                    break
 
         elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
-            tenderee_notfound = True
-            ents = []
-            for ent in list_entitys[0]:
-                if ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
-                        tenderee_notfound = False
-                    elif ent.label == 1:
-                        agency_notfound = False
-                    elif ent.label == 5:
-                        ents.append(ent)
+            # tenderee_notfound = True
+            # ents = []
+            # for ent in list_entitys[0]:
+            #     if ent.entity_type in ['org', 'company']:
+            #         if ent.label == 0:
+            #             tenderee_notfound = False
+            #         elif ent.label == 1:
+            #             agency_notfound = False
+            #         elif ent.label == 5:
+            #             ents.append(ent)
             if tenderee_notfound == True:
-                print('list_codenames',list_codenames[0]['name'])
+                # print('list_codenames',list_codenames[0]['name'])
                 for ent in ents:
                     if ent.entity_text in list_codenames[0]['name']:
                         ent.label = 0
@@ -1456,7 +1484,179 @@ class RoleRuleFinalAdd():
                         # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                         break
 
-
+# 招标人角色召回规则
+class TendereeRuleRecall():
+    def __init__(self):
+        self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
+                                        "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
+
+        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
+                                        "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
+                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
+                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
+        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
+        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
+        # 公告主语判断规则
+        self.subject = re.compile("[我本][院校局]")
+        # 未识别实体召回正则
+        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
+                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        # 未识别实体尾部判断
+        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
+        self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
+
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        # tenderee_notfound = True
+        # agency_notfound = True
+        self.get_tenderee = False
+        ents = []
+        list_name = []
+        for ent in list_entitys[0]:
+            if ent.entity_type == 'name':
+                list_name.append(ent.entity_text)
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    # tenderee_notfound = False
+                    self.get_tenderee = True
+                # elif ent.label == 1:
+                #     agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if not self.get_tenderee:
+            self.entity_context_rule(ents,list_name,list_sentences)
+        if not self.get_tenderee:
+            self.subject_rule(ents,list_articles,list_sentences)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
+
+    #entity上下文正则判断
+    def entity_context_rule(self,entitys,list_name,list_sentences):
+        for ent in entitys:
+            _sentence = list_sentences[0][ent.sentence_index]
+            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                               end_index=ent.end_index, size=40, center_include=True,
+                               word_flag=True, use_text=True,
+                               text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+            if re.search(self.tenderee_left,_span[0]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right,_span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right2, _span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif list_name:
+                pj_name = re.search(self.tenderee_right3, _span[2])
+                if pj_name:
+                    pj_name = pj_name.groupdict()["project"]
+                    for _name in list_name:
+                        if _name in pj_name:
+                            ent.label = 0
+                            ent.values[0] = 0.5
+                            self.get_tenderee = True
+                            break
+    # 公告主语判断
+    def subject_rule(self, entitys,list_articles,list_sentences):
+        content = list_articles[0].content.split('##attachment##')[0]
+        if re.search(self.subject,content):
+            _subject = re.search(self.subject,content).group()
+            for ent in entitys:
+                if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("局", _subject) and re.search("局", ent.entity_text):
+                    _sentence = list_sentences[0][ent.sentence_index]
+                    _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                                       end_index=ent.end_index, size=20, center_include=True,
+                                       word_flag=True, use_text=True,
+                                       text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+                    if not re.search("监督|投诉",_span[0][-10:]):
+                        ent.label = 0
+                        ent.values[0] = 0.5 + ent.values[0] / 10
+                        self.get_tenderee = True
+
+    # 正则召回未识别实体
+    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
+        list_sentence = list_sentences[0]
+        for in_attachment in [False,True]:
+            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
+                sentence_text = sentence.sentence_text
+                tokens = sentence.tokens
+                doc_id = sentence.doc_id
+                in_attachment = sentence.in_attachment
+                list_tokenbegin = []
+                begin = 0
+                for i in range(0, len(tokens)):
+                    list_tokenbegin.append(begin)
+                    begin += len(str(tokens[i]))
+                list_tokenbegin.append(begin + 1)
+                for _match in re.finditer(pattern,sentence_text):
+                    _groupdict = _match.groupdict()
+                    _match_text = _match.group()
+                    _unrecognized_text = _groupdict["unrecognized"]
+                    # print(_unrecognized_text)
+                    # if _match_text[-1] in [':',':']:
+                    #     _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    #     if not _unrecognized:
+                    #         _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    #     if _unrecognized:
+                    #         _unrecognized = _unrecognized.group()
+                    #     else:
+                    #         continue
+                    # else:
+                    #     _unrecognized = _unrecognized_text
+                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    if not _unrecognized:
+                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    if _unrecognized:
+                        _unrecognized = _unrecognized.group()
+                    else:
+                        continue
+                    # print(_unrecognized)
+                    if re.search("某",_unrecognized):
+                        continue
+                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
+                    for j in range(len(list_tokenbegin)):
+                        if list_tokenbegin[j] == begin_index_temp:
+                            begin_index = j
+                            break
+                        elif list_tokenbegin[j] > begin_index_temp:
+                            begin_index = j - 1
+                            break
+                    index = begin_index_temp + len(_unrecognized)
+                    end_index_temp = index
+                    for j in range(begin_index, len(list_tokenbegin)):
+                        if list_tokenbegin[j] >= index:
+                            end_index = j - 1
+                            break
+                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
+                    entity_text = _unrecognized
+                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
+                               begin_index_temp, end_index_temp, in_attachment=in_attachment)
+                    new_entity.label = 0
+                    new_entity.values = [on_value,0,0,0,0,0]
+                    list_entitys[0].append(new_entity)
+                    self.get_tenderee = True
+            if self.get_tenderee:
+                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
+                break
 
 # 时间类别
 class TimePredictor():
@@ -1867,7 +2067,7 @@ class ProductAttributesPredictor():
                 order_end = "%s-%s-%s" % (y, m, num)
             return order_begin, order_end
 
-        t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
+        t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
         if t1:
             year = t1.group(1)
             month = t1.group(3)
@@ -1879,7 +2079,7 @@ class ProductAttributesPredictor():
             order_begin = "%s-%s-01" % (year, month)
             order_end = "%s-%s-%s" % (year, month, num)
             return order_begin, order_end
-        t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
+        t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
         if t2:
             y = t2.group(1)
             m = t2.group(3)
@@ -1888,8 +2088,31 @@ class ProductAttributesPredictor():
             d = '0'+d if len(d)<2 else d
             order_begin = order_end = "%s-%s-%s"%(y,m,d)
             return order_begin, order_end
-        all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
-                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
+        # 时间样式:"202105"
+        t3 = re.search("^(20\d{2})(\d{1,2})$",text)
+        if t3:
+            year = t3.group(1)
+            month = t3.group(2)
+            if int(month)>0 and int(month)<=12:
+                num = self.get_monthlen(year, month)
+                if len(month) < 2:
+                    month = '0' + month
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (year, month)
+                order_end = "%s-%s-%s" % (year, month, num)
+                return order_begin, order_end
+        # 时间样式:"20210510"
+        t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
+        if t4:
+            year = t4.group(1)
+            month = t4.group(2)
+            day = t4.group(3)
+            if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
+                order_begin = order_end = "%s-%s-%s"%(year,month,day)
+                return order_begin, order_end
+        all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
+                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                                 '(?:(?P<d2>\d{1,2})日)?)?$', text)
         y1 = m1 = d1 = y2 = m2 = d2 = ""
         found_math = False
@@ -1981,7 +2204,7 @@ class ProductAttributesPredictor():
                 elif re.search('预算', items[j]):
                     header_dic['预算'] = j
                     budget = items[j]
-                elif re.search('时间|采购实施月份|采购月份', items[j]):
+                elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                     header_dic['时间'] = j
                     order_time = items[j]
 
@@ -2046,14 +2269,24 @@ class ProductAttributesPredictor():
                         elif re.search('采购预算|预算金额', col0_l[i]):
                             header_list2.append(col0_l[i])
                             budget = col1_l[i]
-                            if '万元' in col0_l[i] and '万' not in budget:
-                                budget += '万元'
-                            budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
-                            budget = str(getUnifyMoney(budget))
-                        elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]):
+                            re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                            if re_price:
+                                budget = re_price[0]
+                                if '万元' in col0_l[i] and '万' not in budget:
+                                    budget += '万元'
+                                budget = str(getUnifyMoney(budget))
+                            else:
+                                budget = ""
+                        elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                             header_list2.append(col0_l[i])
                             order_time = col1_l[i].strip()
                             order_begin, order_end = self.fix_time(order_time, html, page_time)
+                    if order_begin != "" and order_end!="":
+                        order_begin_year = int(order_begin.split("-")[0])
+                        order_end_year = int(order_end.split("-")[0])
+                        # 限制附件错误识别时间
+                        if order_begin_year>=2050 or order_end_year>=2050:
+                            order_begin = order_end = ""
                     if product!= "" and demand != "" and budget!="" and order_begin != "":
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                 'order_begin': order_begin, 'order_end': order_end}
@@ -2112,10 +2345,15 @@ class ProductAttributesPredictor():
                         if id3 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 unitPrice = tds[id3]
-                                if '万元' in header_list[2] and '万' not in unitPrice:
-                                    unitPrice += '万元'
-                                unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
-                                unitPrice = str(getUnifyMoney(unitPrice))
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice)
+                                if re_price:
+                                    unitPrice = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in unitPrice:
+                                        unitPrice += '万元'
+                                    # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
+                                    unitPrice = str(getUnifyMoney(unitPrice))
+                                else:
+                                    unitPrice = ""
                             else:
                                 unitPrice = ""
                         if id4 != "":
@@ -2136,10 +2374,14 @@ class ProductAttributesPredictor():
                         if id7 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                                 budget = tds[id7]
-                                if '万元' in header_list2[2] and '万' not in budget:
-                                    budget += '万元'
-                                budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
-                                budget = str(getUnifyMoney(budget))
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                                if re_price:
+                                    budget = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in budget:
+                                        budget += '万元'
+                                    budget = str(getUnifyMoney(budget))
+                                else:
+                                    budget = ""
                             else:
                                 budget = ""
                         if id8 != "":
@@ -2157,7 +2399,13 @@ class ProductAttributesPredictor():
                                         total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
                                     except:
                                         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
-                        if budget != "" and order_time != "" :
+                        if order_begin != "" and order_end != "":
+                            order_begin_year = int(order_begin.split("-")[0])
+                            order_end_year = int(order_end.split("-")[0])
+                            # 限制附件错误识别时间
+                            if order_begin_year >= 2050 or order_end_year >= 2050:
+                                order_begin = order_end = ""
+                        if budget != "" and order_time != "":
                             link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
                             if link not in demand_link:
                                 demand_link.append(link)
@@ -2174,6 +2422,42 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
         return [attr_dic, demand_dic], total_product_money
 
+    def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
+        if len(prem[0]['prem'])==1:
+            list_sentence = list_sentences[0]
+            list_entity = list_entitys[0]
+            _data = product_attrs[1]['demand_info']['data']
+            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
+            order_times = []
+            for entity in list_entity:
+                if entity.entity_type=='time':
+                    sentence = list_sentence[entity.sentence_index]
+                    s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
+                                   end_index=entity.end_index,size=20)
+                    entity_left = "".join(s[0])
+                    if re.search(re_bidding_time,entity_left):
+                        time_text = entity.entity_text.strip()
+                        standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2})日?)")
+                        time_match = re.search(standard_time,time_text)
+                        if time_match:
+                            time_text = time_match.group()
+                        order_times.append(time_text)
+            # print(order_times)
+            order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
+            order_times = [order_time for order_time in order_times if order_time[0]!=""]
+            if len(set(order_times))==1:
+                order_begin,order_end = order_times[0]
+                project_name = codeName[0]['name']
+                pack_info = [pack for pack in prem[0]['prem'].values()]
+                budget = pack_info[0].get('tendereeMoney',0)
+                product = prem[0]['product']
+                link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
+                        'order_begin': order_begin, 'order_end': order_end}
+                _data.append(link)
+            product_attrs[1]['demand_info']['data'] = _data
+        return product_attrs
+
+
 # docchannel类型提取
 class DocChannel():
   def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
@@ -2191,13 +2475,60 @@ class DocChannel():
     self.id2type = {k: v for k, v in enumerate(lb_type)}
     self.id2life = {k: v for k, v in enumerate(lb_life)}
 
+    self.load_pattern()
+
+  def load_pattern(self):
+      self.type_dic = {
+            '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
+            '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
+            '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' #|变更|答疑|澄清|中标|成交|合同|废标|流标
+        }
+
+      self.title_type_dic = {
+            '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
+            '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
+            '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',  # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
+            '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
+        }
+      self.life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预计(采购|招标)(时间|日期)',
+            '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
+            '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
+            '招标答疑': '现澄清为|现澄清如下|第\d次澄清|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
+            '公告变更': '原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
+            '候选人公示': '候选人公示|评标结果公示',
+            '中标信息': '供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
+            '中标信息2': '(成交|中标)(日期|时间)[::\s]|成交金额:',
+            '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
+            '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
+        }
+      self.title_life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
+            '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
+            '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告)',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
+            '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',  # 合同$|
+            '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
+            '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
+            # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示',
+            '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
+            '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
+        }
+
+      self.wrong_win = '按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
+
   def load_life(self,life_model):
     with tf.Graph().as_default() as graph:
       output_graph_def = graph.as_graph_def()
       with open(os.path.dirname(__file__)+life_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
-        print("%d ops in the final graph" % len(output_graph_def.node))
+        # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
         sess = tf.Session(graph=graph)
         sess.run(tf.global_variables_initializer())
@@ -2216,7 +2547,7 @@ class DocChannel():
       with open(os.path.dirname(__file__)+type_model, 'rb') as f:
         output_graph_def.ParseFromString(f.read())
         tf.import_graph_def(output_graph_def, name='')
-        print("%d ops in the final graph" % len(output_graph_def.node))
+        # print("%d ops in the final graph" % len(output_graph_def.node))
         del output_graph_def
         sess = tf.Session(graph=graph)
         sess.run(tf.global_variables_initializer())
@@ -2306,7 +2637,20 @@ class DocChannel():
     else:
       return 0
 
-  def predict(self, title='', list_sentence='', web_source_no=''):
+  def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
+    not_extract_dic = {
+        104: '招标文件',
+        106: '法律法规',
+        107: '新闻资讯',
+        108: '拟建项目',
+        109: '展会推广',
+        110: '企业名录',
+        111: '企业资质',
+        112: '全国工程人员',
+        113: '业主采购'
+    }
+    if original_docchannel in not_extract_dic:
+        return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
     if web_source_no in ['02104-7']:
       return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
 
@@ -2321,7 +2665,7 @@ class DocChannel():
     data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
     text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
     title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
-    result = {'docchannel': {'docchannel':'', 'doctype':''}}
+    result = {'docchannel': {'docchannel':'', 'doctype':'', "original_docchannel_id": str(original_docchannel)}}
 
     array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
     array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
@@ -2350,7 +2694,7 @@ class DocChannel():
       id = np.argmax(pred, axis=1)[0]
       prob = pred[0][id]
       result['docchannel']['docchannel'] = self.id2life[id]
-      # print('生命周期:',self.id2life[id], '概率:',prob)
+      # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
       # if id == 6:
       if result['docchannel']['docchannel'] == '中标信息':
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
@@ -2405,6 +2749,312 @@ class DocChannel():
               log('正则把中标信息修改为空')
       return channel_dic
 
+  def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
+      '''
+      正则,模型混合预测,返回公告类型及生命周期
+      :param title:  公告标题
+      :param list_sentence:  预处理后返回的句子实体列表
+      :param html: 公告原文 html 内容
+      :param bidway: 招标方式
+      :param prem: 提取的prem 字典
+      :return: {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}} 字典格式
+      '''
+      def cut_single_cn_space(text):
+          new_text = ""
+          for w in text.split():
+              if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
+                  new_text += w
+              else:
+                  new_text += ' ' + w
+          return new_text
+
+      def html2text(html):
+          ser = re.search('<div[^<>]*richTextFetch', html)
+          if ser:
+              html = html[:ser.start()]+'##richTextFetch##'
+          text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
+          text = re.sub('\s+', ' ', text)
+          text = re.sub('[/|[()()]', '', text)
+          text = cut_single_cn_space(text)
+          return text[:20000]
+
+      def count_diffser(pattern, text):
+          num = 0
+          kw = []
+          for p in pattern.split(';'):
+              if re.search(p, text):
+                  num += 1
+                  kw.append(re.search(p, text).group(0))
+          return num, ';'.join(kw)
+
+      def is_contain_winner(extract_json):
+          if re.search('win_tenderer', extract_json):
+              return True
+          else:
+              return False
+
+      def is_single_source(bidway, title):
+          if re.search('单一来源|单一性采购', title):
+              return True
+          elif bidway == '单一来源':
+              return True
+          else:
+              return False
+
+      def get_type(title, text):
+          if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
+                                                                   text):  # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
+          elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
+          elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
+              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
+              return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
+          elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
+              return '采招数据', (
+                          re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(
+                  0)
+          elif re.search(self.title_type_dic['新闻资讯'], title):
+              if re.search(self.title_type_dic['采招数据'], title + text[:150]):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0)
+              return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
+          else:
+              return '', '没有公告类型关键词,返回空'
+
+      def get_life(title, text, extract_json="", bidway="",  original_docchannel=''):
+          if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
+              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
+                      0)
+              elif re.search(self.title_life_dic['候选人公示'], title):
+                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
+              elif re.search(self.title_life_dic['中标信息'], title):
+                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+              elif re.search('终止|废标|流标', title):
+                  return '废标公告', re.search('终止|废标|流标', title).group(0)
+              elif is_single_source(bidway, title):
+                  return '中标信息', 'bidway单一来源'
+              return '采购意向', (
+                          re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0)
+          elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
+              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
+                      0)
+              elif re.search(self.title_life_dic['候选人公示'], title):
+                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
+              elif re.search(self.title_life_dic['中标信息'], title):
+                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+              elif re.search('终止|废标|流标', title):
+                  return '废标公告', re.search('终止|废标|流标', title).group(0)
+              elif is_single_source(extract_json, title):
+                  return '中标信息', 'bidway单一来源'
+              return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
+          elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
+              if re.search(self.title_life_dic['废标公告'], title):
+                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
+              #         elif re.search('(中标|成交)结果', title[-8:]):
+              #             return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)       
+              return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0)
+          elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len(
+                  re.findall('(答:|回复:)', text)) >= 2:  # or re.search(self.title_life_dic['招标答疑'], text[:150])
+              if re.search(self.title_life_dic['废标公告'], title):
+                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
+              elif re.search('(中标|成交)结果', title[-8:]):
+                  return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
+              return '招标答疑', (
+                          re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or re.search(
+                      '(答:|回复:)', text)).group(0)
+          elif re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150]):
+              return '废标公告', (
+                          re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150])).group(0)
+          elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
+              if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None:
+                  return '中标信息', '候选人公示排除,修改为中标信息'
+              return '候选人公示', (
+                          re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group(
+                  0)
+          elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[
+                                                                                             :150]):
+              return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'],
+                                                                                    text[:150]) or re.search(
+                  self.life_dic['合同公告'], text)).group(0)
+          elif re.search(self.life_dic['合同公告'].replace(';', '|'), text):  # or re.search(self.life_dic['合同公告'], text[:300]):
+              num, kw = count_diffser(self.life_dic['合同公告'], text)
+              if num >= 3:
+                  return '合同公告', kw
+              elif re.search(self.title_life_dic['招标公告'], title[-8:]):
+                  return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
+              elif not is_contain_winner(extract_json):
+                  return '', '有合同关键词无中标角色返回空'
+              return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
+          elif is_single_source(extract_json, title):
+              return '中标信息', '单一来源采购'
+          elif re.search(self.title_life_dic['中标信息'], title):
+              if re.search(self.title_life_dic['资审结果'], title+text[:150]):
+                  return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
+              return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
+          elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]):
+              if re.search(self.title_life_dic['资审结果'], title+text[:150]):
+                  return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
+              # if re.search(self.wrong_win, text):
+              #     return '招标公告', re.search(self.wrong_win, text).group(0)
+              return '中标信息', (
+                          re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group(
+                  0)
+          elif re.search(self.life_dic['中标信息2'], text[:]):
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0)
+          elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json):
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0)
+          elif re.search('公开选取.{,20}机构的公告', title):
+              if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
+                  return '中标信息', '机构选取有中选机构'
+              else:
+                  return '招标公告', '公开选取机构'
+          elif is_contain_winner(extract_json):
+              num, kw = count_diffser(self.life_dic['招标公告'], text)
+              if re.search(self.wrong_win, text):
+                  return '招标公告', re.search(self.wrong_win, text).group(0)
+              elif num >= 2:
+                  return '招标公告', kw
+              elif re.search('##richTextFetch##', text):
+                  return '', '提取到中标人但包含附件返回空'
+              return '中标信息', '提取到中标人'
+          elif re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:]):
+              return '资审结果', (re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:])).group(0)
+          elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:]):
+              if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
+                  return '', '招标正则召回标题有其他类别关键词,返回空'
+              return '招标公告', (re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'),
+                                                                                    text[:])).group(0)
+          else:
+              return '', '未预测到关键词, 返回空'
+
+      not_extract_dic = {
+          104: '招标文件',
+          106: '法律法规',
+          107: '新闻资讯',
+          108: '拟建项目',
+          109: '展会推广',
+          110: '企业名录',
+          111: '企业资质',
+          112: '全国工程人员',
+          113: '业主采购'
+      }
+
+      origin_dic = {51: '公告变更',
+       52: '招标公告',
+       101: '中标信息',
+       102: '招标预告',
+       103: '招标答疑',
+       104: '招标文件',
+       105: '资审结果',
+       106: '法律法规',
+       107: '新闻资讯',
+       108: '拟建项目',
+       109: '展会推广',
+       110: '企业名录',
+       111: '企业资质',
+       112: '全国工程',
+       113: '业主采购',
+       114: '采购意向',
+       115: '拍卖出让',
+       116: '土地矿产',
+       117: '产权交易',
+       118: '废标公告',
+       119: '候选人公示',
+       120: '合同公告'}
+      if original_docchannel in not_extract_dic:
+          return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
+      if web_source_no in ['02104-7', '04733']: # 这些数据源无法识别
+          return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}
+
+      title = re.sub('[^\u4e00-\u9fa5]', '', title)
+      if len(title) > 50:
+          title = title[:20] + title[-30:]
+
+      text = html2text(html)
+      prem_json = json.dumps(prem, ensure_ascii=False)
+      result = {'docchannel': {'docchannel': '', 'doctype': ''}}
+
+      doc_type, type_kw = get_type(title, text)
+      doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
+      if doc_type in self.title_type_dic:
+          result['docchannel']['doctype'] = doc_type
+      if doc_life in self.title_life_dic:
+          result['docchannel']['docchannel'] = doc_life
+
+      if doc_type=="" or doc_life=="":
+          list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
+          token_l = [it.tokens for it in list_sentence]
+          tokens = [it for l in token_l for it in l]
+          content = ' '.join(tokens[:500])
+          data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
+                                                          dochtmlcon=content)  # 标题最多取50字
+          text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
+          title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
+
+          array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
+          array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
+
+          if doc_type == "":
+              pred = self.type_sess.run(self.type_softmax,
+                                        feed_dict={
+                                            self.type_title: array_title,
+                                            self.type_content: array_content,
+                                            self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                            self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                            self.type_prob: 1}
+                                        )
+              id = np.argmax(pred, axis=1)[0]
+              prob = pred[0][id]
+              result['docchannel']['doctype'] = self.id2type[id]
+              # print('公告类别:', self.id2type[id], '概率:',prob)
+              # if id == 0:
+          if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
+              if len(text)>150 and re.search(self.kws, content):
+                  pred = self.lift_sess.run(self.lift_softmax,
+                                            feed_dict={
+                                                self.lift_title: array_title,
+                                                self.lift_content: array_content,
+                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                                self.lift_prob: 1}
+                                            )
+                  id = np.argmax(pred, axis=1)[0]
+                  prob = pred[0][id]
+                  if self.id2life[id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json):
+                      result['docchannel']['docchannel'] = '招标公告'
+                  elif self.id2life[id] == '采购意向' and re.search('意向品牌|意向单位', text):
+                      result['docchannel']['docchannel'] = '招标公告'
+                  else:
+                      result['docchannel']['docchannel'] = self.id2life[id]
+                      # print('生命周期:',self.id2life[id], '概率:',prob)
+                      # if id == 6:
+                      if result['docchannel']['docchannel'] == '中标信息':
+                          if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
+                                             ''.join([it for it in content if it.isalpha()])):
+                              result['docchannel']['docchannel'] = '候选人公示'
+                              # return '候选人公示', prob
+                              # return [{'docchannel': '候选人公示'}]
+      # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw))
+      # print('result: ', result)
+      if result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(self.title_life_dic['废标公告'], title)==None:
+          result['docchannel']['docchannel'] = '中标信息'
+      if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果
+          result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
+      else:
+          result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
+      return result
+
 # 保证金支付方式提取
 class DepositPaymentWay():
     def __init__(self,):

+ 1 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -1,7 +1,7 @@
 import re
 
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
-ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
+ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
 # ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)')
 
 # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,

+ 1 - 1
BiddingKG/dl/table_head/models/model.py

@@ -2,7 +2,7 @@ import sys
 import os
 import numpy as np
 from keras.layers import Lambda, Dense, Reshape, Bidirectional, LSTM, Conv2D, BatchNormalization, LeakyReLU, Masking
-from keras_preprocessing.sequence import pad_sequences
+from keras.preprocessing.sequence import pad_sequences
 sys.path.append(os.path.dirname(__file__))
 
 from models.layer_utils import BatchReshape1, BatchReshape2, MyPadding, MySplit, BatchReshape3, \

+ 1 - 1
BiddingKG/dl/table_head/pre_process.py

@@ -1,7 +1,6 @@
 import os
 import random
 import sys
-import psycopg2
 import numpy as np
 sys.path.append(os.path.dirname(__file__) + "/../")
 from common.Utils import embedding_word, embedding_word_forward
@@ -26,6 +25,7 @@ def get_sentence_index_list(sentence, dict_path='utils/ppocr_keys_v1.txt'):
 
 
 def postgresql_util(sql, limit):
+    import psycopg2
     conn = psycopg2.connect(dbname="table_head_label", user="postgres", password="postgres",
                             host="192.168.2.103")
     cursor = conn.cursor()

+ 1 - 27
BiddingKG/dl/test/12.py

@@ -1,30 +1,4 @@
-import time
-
-import math
-def getAvgD(aint_dis):
-    if len(aint_dis)==0:
-        return 0
-    avg_dis = 1
-    int_avgD = int(sum(aint_dis)/len(aint_dis))
-    new_aint_dis = [a for a in aint_dis]
-    print(sum(aint_dis)/len(aint_dis))
-    min_pow = 10000000
-    min_dis = min(aint_dis)
-
-    for _dis in range(min(aint_dis),max(aint_dis)+1):
-
-        pow_x = 0
-        for _d in new_aint_dis:
-            pow_x += math.sqrt(abs((_d-_dis)))
-        print(_dis,pow_x)
-        if pow_x<min_pow:
-            min_pow = pow_x
-            min_dis = _dis
-
-    return min_dis
-
-import re
-print(re.search('"data": \{"\d+": \[\]\}',"2"))
+print("243705217")
 
 
 

+ 72 - 38
BiddingKG/dl/test/test4.py

@@ -14,70 +14,104 @@ import codecs
 import requests
 import time
 
-_time1 = time.time()
-sys.path.append(os.path.abspath("../.."))
-import fool
-from BiddingKG.dl.common.Utils import *
-from BiddingKG.dl.interface.extract import predict
+import logging
 import json
+global json,logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
-def test(name,content):
+import json
+import random
+
+session = requests.Session()
+
+def test(name,content,_url=None):
+    # _times = 2
+    # _content = ""
+    # for _ in range(_times):
+    #     _content += content
+    # content = _content
+    print(len(content))
     user = {
             "content": content,
             "doc_id":name,
-            "timeout":60
+            "timeout":2000,
+            "original_docchannel":101
             }
+    # print(user)
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
 
+    list_url = ["http://127.0.0.1:15030/content_extract",
+                "http://127.0.0.1:15031/content_extract",
+                "http://127.0.0.1:15032/content_extract",
+                "http://127.0.0.1:15033/content_extract",
+                "http://127.0.0.1:15034/content_extract",
+                "http://127.0.0.1:15035/content_extract",
+                "http://127.0.0.1:15036/content_extract",
+                "http://127.0.0.1:15037/content_extract",
+                ]
+    # _i = random.randint(0,len(list_url)-1)
+    # _resp = requests.post(list_url[_i], json=user, headers=myheaders, verify=True)
+
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:15030/article_extract"
-    _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
+    _url = "http://192.168.2.102:15030/test"
+    _url = "http://192.168.2.102:15030/industry_extract"
+    _url = "http://192.168.2.102:15030/content_extract"
+
+    _resp = session.post(_url, json=user,verify=True,timeout=1000)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     resp_json = _resp.content.decode("utf-8")
-    print("===",json.loads(resp_json))
+    logging.info("%d===%s"%(_resp.status_code,resp_json[:100]))
 
-    print("====",json.dumps(json.loads(resp_json)))
-    print(resp_json)
     return resp_json
 
+def presure_test():
 
+    from BiddingKG.dl.common.multiThread import MultiThreadHandler
+    from queue import Queue
+    text = codecs.open("2.html","r",encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div",id="pcontent"))
 
 
-if __name__=="__main__":
+    start_time = time.time()
+    task_queue = Queue()
+    for i in range(300):
+        task_queue.put(content)
+    def _handle(item,result_queue):
+        test("",item)
+    mt = MultiThreadHandler(task_queue,_handle,None,3)
+    mt.run()
+    end_time = time.time()
+    print("all takes :%ds"%(end_time-start_time))
+
+def runlocal(content):
+    import sys
+    import os
+    sys.path.append(os.path.abspath("../.."))
+    import fool
+    from BiddingKG.dl.interface.extract import predict
+
+    predict("12", content,"打印机",original_docchannel=101)
+
+def run_one():
+    from BiddingKG.dl.interface.extract import predict
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
     text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    # text = codecs.open("2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
-    # df_a = {"html":[]}
-    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
-    # import pandas as pd
-    # df = pd.DataFrame(df_a)
-    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
-    # print()
-    #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
-    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
-    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
-    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
-    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
-    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
-    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
-    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
-    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
-    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
-    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     # text = '''
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # '''
     print("start")
-    # content = '''
-    # 广州比地数据科技有限公司翻译服务工程招标
-    # '''
-    # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
-    # print(predict("12", content,"打印机"))
-    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
-    print(predict("12", content,"打印机"))
+    _time1 = time.time()
+    print(predict("12", text,"打印机",original_docchannel=52))
     # test(12,content)
     # test(12,text)
-    print("takes",time.time()-_time1)
+    print("takes",time.time()-a)
     pass
+
+if __name__=="__main__":
+    # presure_test()
+    # run_one()
+    print(json.loads('[{"fileMd5":"9e3dd6501efaa453a95f2d33e9c48913"}]'))

+ 6 - 3
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -92,7 +92,8 @@ def predict(doc_id,text):
     #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
     #             pass
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences,list_entitys, codeName)
+    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
 
@@ -143,11 +144,13 @@ def predict(doc_id,text):
                 if entity.pointer_person:
                     print("公司->联系人1:",end=' ')
                     print(entity.entity_text,[i.entity_text for i in entity.pointer_person],entity.label,entity.values)
+                    # print(_sentence.tokens[entity.begin_index:entity.end_index+3])
                     # print(entity.entity_text,entity.label,entity.values)
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                 else:
                     print("公司->联系人2:", end=' ')
                     print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
+                    print(_sentence.tokens[entity.begin_index:entity.end_index+3])
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
                 if entity.label in [2,3,4]:
@@ -166,7 +169,7 @@ def predict(doc_id,text):
             #     if entity.pointer_pack:
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type =='money':
-            #     print('money',entity.entity_text,entity.label)
+            #     print('money',entity.entity_text,entity.label,entity.money_unit,entity.notes)
             # elif entity.entity_type =='phone':
             #     print('phone',entity.entity_text)
             # elif entity.entity_type =='name':
@@ -177,7 +180,7 @@ def predict(doc_id,text):
 
     #print(prem)
     # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=2,ensure_ascii=False)
+    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=1,ensure_ascii=False)
 
          
 # def test(name,content):

+ 9 - 7
BiddingKG/extract.app.json

@@ -1,18 +1,20 @@
 {
   "generate_token": "true",
   "metadata": {
-    "cpu": 7,
-    "instance": 4,
-    "memory": 18000,
+    "cpu": 4,
+    "instance": 7,
+    "memory": 11000,
     "region": "cn-hangzhou",
     "resource": "eas-r-9oq7xupatg8yoiyuvk",
     "rpc": {
-      "batching": "true",
-      "keepalive": 60000,
-      "max_batch_size": 40
+      "batching": "false",
+      "keepalive": 180000,
+      "max_queue_size": 100,
+      "io_threads": 4,
+      "worker_threads": 5
     }
   },
-  "workers":7,
+  "workers":5,
   "name": "content_extract",
   "processor_entry": "./BiddingKG/app.py",
   "processor_path": "oss://eas-model-hangzhou/1255640119316927/BiddingKG_eas.zip",

+ 2917 - 0
BiddingKG/getAttributes.py

@@ -0,0 +1,2917 @@
+
+
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL
+from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
+from decimal import Decimal
+import re
+import copy
+import math
+import pandas as pd
+import os
+from scipy.optimize import linear_sum_assignment
+from BiddingKG.dl.interface.Entitys import Match
+import numpy as np
+
+def getTheRole(entity,role_list):
+    '''
+    @summary:根据实体名称拿到index
+    @param:
+        entity:实体名称
+        role_list:角色list
+    @return:该实体所在下标
+    '''
+    for role_index in range(len(role_list)):
+        if entity in role_list[role_index]:
+            return role_index
+    return None
+
+dict_role_id = {"0":"tenderee",
+                "1":"agency",
+                "2":"win_tenderer",
+                "3":"second_tenderer",
+                "4":"third_tenderer"}
+
+def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
+    '''
+    @param:
+        packageList:文章的包的信息,包号-sent_index-词偏移-字偏移-[[前作用域句子,句内偏移],[后作用域句子,句内偏移]]-匹配集合
+        sentence_index:实体所在的句子
+        begin_index:实体所在句子的起始位置
+    @return:公司实体所属的包
+    @summary: 优化多标段,确定标段作用域之后,寻找作用域包含该实体的所有包,从前往后找到一个还没有该roleid的包返回,若找到的包都有roleid,则返回第一个,若没有找到包,返回None
+    '''
+    
+    '''
+    if len(packageList)==0:
+        return None
+    before_index = None
+    after_index = None
+    equal_index = None
+    equal_count = 0
+    
+    
+    for pack_index in range(len(packageList)):
+        if packageList[pack_index][1]>sentence_index and after_index is None:
+            after_index = pack_index
+        if packageList[pack_index][1]<sentence_index:
+            before_index = pack_index
+        if packageList[pack_index][1]==sentence_index and equal_index is None:
+            equal_index = pack_index
+    #当前句子和之前句子未找到包
+    if before_index is None and equal_index is None:
+        return None
+    else:
+        if after_index is None:
+            end_index = len(packageList)
+        else:
+            end_index = after_index
+        #只在当前句子找到一个包号
+        if end_index-max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1))==1:
+            return packageList[end_index-1][0]
+        else:
+            for i in range(max((before_index if before_index is not None else -1,equal_index if equal_index is not None else -1)),end_index):
+                if packageList[i][2]>int(begin_index):
+                    if packageList[i-1][4]:
+                        return packageList[i-1][0]
+                    else:
+                        if packageList[i][4]:
+                            return packageList[i-1][0]
+                        else:
+                            return packageList[i][0]
+            return packageList[end_index-1][0]
+    '''
+    if len(packageList)==0:
+        return None,False
+    list_legalPack = []
+    for pack_index in range(len(packageList)):
+        if DIRECT=="L" and (packageList[pack_index]["sentence_index"]>sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetWords_begin"]>begin_index)):
+            continue
+        if DIRECT=="R" and (packageList[pack_index]["sentence_index"]<sentence_index or (packageList[pack_index]["sentence_index"]==sentence_index and packageList[pack_index]["offsetwords_begin"]<begin_index)):
+            continue
+        if (packageList[pack_index]["scope"][0][0]<sentence_index or (packageList[pack_index]["scope"][0][0]==sentence_index and packageList[pack_index]["scope"][0][1]<=begin_index))  and (packageList[pack_index]["scope"][1][0]>sentence_index or (packageList[pack_index]["scope"][1][0]==sentence_index and packageList[pack_index]["scope"][1][1]>=begin_index)):
+            if MAX_DIS is not None:
+                if abs(sentence_index-packageList[pack_index]["sentence_index"])<=MAX_DIS:
+                    list_legalPack.append(pack_index)
+            else:
+                list_legalPack.append(pack_index)
+    # if (packageList[pack_index]["scope"][0][0] < sentence_index
+    #         or (packageList[pack_index]["scope"][0][0] == sentence_index
+    #      and packageList[pack_index]["scope"][0][1] <= begin_index))
+    #         and (packageList[pack_index]["scope"][1][0] > sentence_index
+    #      or (packageList[pack_index]["scope"][1][0] == sentence_index
+    #         and packageList[pack_index]["scope"][1][1] >= begin_index)):
+    #     pass
+    _flag = True
+    for _index in list_legalPack:
+        if roleid in packageList[_index]["hit"]:
+            continue
+        else:
+            _flag = False
+            packageList[_index]["hit"].add(roleid)
+            return packageList[_index]["pointer"],_flag
+    if len(list_legalPack)>0:
+        return packageList[0]["pointer"],_flag
+    return None,False
+
+#生成合法的组合
+def get_legal_comba(list_entity,dict_role_combination):
+    
+    #拿到一个包中所有合法的组合
+    def circle_package(_dict_legal_combination):
+        list_dict_role_first = []
+        for _role in _dict_legal_combination:
+            if len(list_dict_role_first)==0:
+                for _entity in _dict_legal_combination[_role]:
+                    if _entity !="":
+                        list_dict_role_first.append({_role:_entity})
+            else:
+                list_dict_role_after = []
+                _find_count = 0
+                for _entity in _dict_legal_combination[_role]:
+                    if _entity !="":
+                        for _dict in list_dict_role_first:
+                            _flag = True
+                            for _key1 in _dict:
+                                if _entity==_dict[_key1]:
+                                    #修改为招标人和代理人可以为同一个
+                                    if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
+                                        _flag = True
+                                    else:
+                                        _flag = False
+                            if _flag:
+                                _find_count += 1
+                                _new_dict = copy.copy(_dict)
+                                _new_dict[_role] = _entity
+                                if len(list_dict_role_after)>100000:
+                                    break
+                                list_dict_role_after.append(_new_dict)
+                            else:
+                                # 2021/5/25 update,同一实体(entity_text)不同角色
+                                if len(list_dict_role_after) > 100000:
+                                    break
+                                for _dict in list_dict_role_first:
+                                    for _key1 in _dict:
+                                        if _entity == _dict[_key1]:
+                                            _new_dict = copy.copy(_dict)
+                                            _new_dict.pop(_key1)
+                                            _new_dict[_role] = _entity
+                                            list_dict_role_after.append({_role:_entity})
+                if len(list_dict_role_after)==0:
+                    pass
+                else:
+                    list_dict_role_first.extend(list_dict_role_after)
+
+        return list_dict_role_first
+
+
+
+    def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
+        last_layer = False
+        #若是空组合则放回空
+        if len(_dict_legal_combination.keys())==0:
+            return []
+        #递归到最后一层则修改状态
+        if len(_dict_legal_combination.keys())==1:
+            last_layer = True
+        #取一个角色开始进行遍历
+        _key_role = list(_dict_legal_combination.keys())[0]
+        for item in _dict_legal_combination[_key_role]:
+            copy_dict_one_selution = copy.copy(dict_one_selution)
+            copy_dict_legal_combination = {}
+            copy_set_legal_entity = copy.copy(set_legal_entity)
+            
+            #复制余下的所有角色,进行下一轮递归
+            for _key in _dict_legal_combination.keys():
+                if _key!=_key_role:
+                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
+
+            #修改为招标人和代理人可以为同一个
+            if item !="":
+                _flag = True
+                if str(_key_role) in ["0","1"]:
+                    for _key_flag in copy_dict_one_selution:
+                        if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
+                            _flag = False
+                else:
+                    for _key_flag in copy_dict_one_selution:
+                        if copy_dict_one_selution[_key_flag]==item:
+                            _flag = False
+                if _flag:
+                    copy_dict_one_selution[_key_role] = item
+                    
+            '''
+            if item not in copy_set_legal_entity:
+                if item !="":
+                    copy_dict_one_selution[_key_role] = item
+            '''
+            copy_set_legal_entity.add(item)
+            if last_layer:
+                list_all_selution.append(copy_dict_one_selution)
+            else:
+                recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
+    
+
+    #递归匹配各个包的结果        
+    def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
+        last_layer = False
+        if len(_dict_legal_combination.keys())==0:
+            return []
+        if len(_dict_legal_combination.keys())==1:
+            last_layer = True
+        _key_pack = list(_dict_legal_combination.keys())[0]
+        for item in _dict_legal_combination[_key_pack]:
+            copy_dict_one_selution = copy.copy(dict_one_selution)
+            copy_dict_legal_combination = {}
+            for _key in _dict_legal_combination.keys():
+                if _key!=_key_pack:
+                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
+            for _key_role in item.keys():
+                copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
+            if last_layer:
+                list_all_selution.append(copy_dict_one_selution)
+            else:
+                recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
+        return list_all_selution
+    
+    #循环获取所有包组合
+    def circle_pageages(_dict_legal_combination):
+        list_all_selution = []
+        for _key_pack in _dict_legal_combination.keys():
+            list_key_selution = []
+            for item in _dict_legal_combination[_key_pack]:
+                _dict = dict()
+                for _key_role in item.keys():
+                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
+                list_key_selution.append(_dict)
+            if len(list_all_selution)==0:
+                list_all_selution = list_key_selution
+            else:
+                _list_all_selution = []
+                for item_1 in list_all_selution:
+                    for item_2 in list_key_selution:
+                        _list_all_selution.append(dict(item_1,**item_2))
+                list_all_selution = _list_all_selution
+        return list_all_selution
+                      
+    #拿到各个包解析之后的结果
+    _dict_legal_combination = {}
+    for packageName in dict_role_combination.keys():
+        _list_all_selution = []
+
+        # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
+        _list_all_selution = circle_package(dict_role_combination[packageName])
+        '''
+        # print("===1")
+        # print(packageName)
+        for item in _list_all_selution:
+            # print(item)
+        # print("===2")
+        '''
+        #去除包含子集
+        list_all_selution_simple = []
+        _list_set_all_selution = []
+        for item_selution in _list_all_selution:
+            item_set_selution = set()
+            for _key in item_selution.keys():
+                item_set_selution.add((_key,item_selution[_key]))
+            _list_set_all_selution.append(item_set_selution)
+        if len(_list_set_all_selution)>1000:
+            _dict_legal_combination[packageName] = _list_all_selution
+            continue
+        for i in range(len(_list_set_all_selution)):
+            
+            be_included = False
+            for j in range(len(_list_set_all_selution)):
+                if i!=j:
+                    if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
+                        be_included = True
+            if not be_included:
+                list_all_selution_simple.append(_list_all_selution[i])
+        _dict_legal_combination[packageName] = list_all_selution_simple
+    _list_final_comba = []
+    #对各个包的结果进行排列组合
+    _comba_count = 1
+    for _key in _dict_legal_combination.keys():
+        _comba_count *= len(_dict_legal_combination[_key])
+    #如果过大,则每个包只取概率最大的那个
+    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
+    if _comba_count>250:
+        new_dict_legal_combination = dict()
+        for _key_pack in _dict_legal_combination.keys():
+            MAX_PROB = -1000
+            _MAX_PROB_COMBA = None
+            for item in _dict_legal_combination[_key_pack]:
+                # print(_key_pack,item)
+                _dict = dict()
+                for _key in item.keys():
+                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
+                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
+                if _prob>MAX_PROB:
+                    MAX_PROB = _prob
+                    _MAX_PROB_COMBA = [item]
+            if _MAX_PROB_COMBA is not None:
+                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
+        _dict_legal_combination = new_dict_legal_combination
+    #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
+    _list_final_comba = circle_pageages(_dict_legal_combination)
+    #除了Project包(招标人和代理人),其他包是不会有冲突的
+    #查看是否有一个实体出现在了Project包和其他包中,如有,要进行裁剪
+    _list_real_comba = []
+    for dict_item in _list_final_comba:
+        set_project = set()
+        set_other = set()
+        for _key in list(dict_item.keys()):
+            if _key.split("$$")[0]=="Project":
+                set_project.add(dict_item[_key])
+            else:
+                set_other.add(dict_item[_key])
+        set_common = set_project&set_other
+        if len(set_common)>0:
+            dict_project = {}
+            dict_not_project = {}
+            for _key in list(dict_item.keys()):
+                if dict_item[_key] in set_common:
+                    if str(_key.split("$$")[0])=="Project":
+                        dict_project[_key] = dict_item[_key]
+                    else:
+                        dict_not_project[_key] = dict_item[_key]
+                else:
+                    dict_project[_key] = dict_item[_key]
+                    dict_not_project[_key] = dict_item[_key]
+            
+            _list_real_comba.append(dict_project)
+            _list_real_comba.append(dict_not_project)
+        else:
+            _list_real_comba.append(dict_item)
+
+    return _list_real_comba
+
+def get_dict_entity_prob(list_entity,on_value=0.5):
+    dict_pack_entity_prob = {}
+    for in_attachment in [False,True]:
+        identified_role = []
+        if in_attachment==True:
+            identified_role = [value[0] for value in dict_pack_entity_prob.values()]
+        for entity in list_entity:
+            if entity.entity_type in ['org','company'] and entity.in_attachment==in_attachment:
+                values = entity.values
+                role_prob = float(values[int(entity.label)])
+                _key = entity.packageName+"$$"+str(entity.label)
+                if role_prob>=on_value and str(entity.label)!="5":
+                    _key_prob = _key+"$text$"+entity.entity_text
+                    if in_attachment == True:
+                        if entity.entity_text in identified_role:
+                            continue
+                    if _key_prob in dict_pack_entity_prob:
+                        if role_prob>dict_pack_entity_prob[_key_prob][1]:
+                            dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+                    else:
+                        dict_pack_entity_prob[_key_prob] = [entity.entity_text,role_prob]
+    return dict_pack_entity_prob
+
+
+#计算合计期望            
+def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
+    '''
+    expect = 0
+    for entity in list_entity:
+        if entity.entity_type in ['org','company']:
+            values = entity.values
+            role_prob = float(values[int(entity.label)])
+            _key = entity.packageName+"$$"+str(entity.label)
+            if role_prob>on_value and str(entity.label)!="5":
+                if _key in combination.keys() and combination[_key]==entity.entity_text:
+                    expect += math.pow(role_prob,4)
+                else:
+                    expect -= math.pow(role_prob,4)
+    '''
+    #修改为同一个实体只取对应包-角色的最大的概率值
+    expect = 0
+    dict_entity_prob = {}
+    for _key_pack_entity in dict_pack_entity_prob:
+        _key_pack = _key_pack_entity.split("$text$")[0]
+        role_prob = dict_pack_entity_prob[_key_pack_entity][1]
+        if _key_pack in combination.keys() and combination[_key_pack]==dict_pack_entity_prob[_key_pack_entity][0]:
+            if _key_pack_entity in dict_entity_prob.keys():
+                if dict_entity_prob[_key_pack_entity]<role_prob:
+                    dict_entity_prob[_key_pack_entity] = role_prob
+            else:
+                dict_entity_prob[_key_pack_entity] = role_prob
+        else:
+            if _key_pack_entity in dict_entity_prob.keys():
+                if dict_entity_prob[_key_pack_entity]>-role_prob:
+                    dict_entity_prob[_key_pack_entity] = -role_prob
+            else:
+                dict_entity_prob[_key_pack_entity] = -role_prob
+    # for entity in list_entity:
+    #     if entity.entity_type in ['org','company']:
+    #         values = entity.values
+    #         role_prob = float(values[int(entity.label)])
+    #         _key = entity.packageName+"$$"+str(entity.label)
+    #         if role_prob>=on_value and str(entity.label)!="5":
+    #             if _key in combination.keys() and combination[_key]==entity.entity_text:
+    #                 _key_prob = _key+entity.entity_text
+    #                 if _key_prob in dict_entity_prob.keys():
+    #                     if dict_entity_prob[_key_prob]<role_prob:
+    #                         dict_entity_prob[_key_prob] = role_prob
+    #                 else:
+    #                     dict_entity_prob[_key_prob] = role_prob
+    #             else:
+    #                 _key_prob = _key+entity.entity_text
+    #                 if _key_prob in dict_entity_prob.keys():
+    #                     if dict_entity_prob[_key_prob]>-role_prob:
+    #                         dict_entity_prob[_key_prob] = -role_prob
+    #                 else:
+    #                     dict_entity_prob[_key_prob] = -role_prob
+    for _key in dict_entity_prob.keys():
+        symbol = 1 if dict_entity_prob[_key]>0 else -1 
+        expect += symbol*math.pow(dict_entity_prob[_key],2)
+    return expect
+
+
+def getRoleList(list_sentence,list_entity,on_value = 0.5):
+    '''
+    @summary: 搜索树,得到所有不矛盾的角色组合,取合计期望值最大的作为结果返回
+    @param:
+        list_sentence:文章所有的sentence
+        list_entity:文章所有的实体
+        on_value:概率阈值
+    @return:文章的角色list
+    '''
+
+    pack = getPackagesFromArticle(list_sentence,list_entity)
+    if pack is None:
+        return None
+    PackageList,PackageSet,dict_PackageCode = pack
+
+    #拿到所有可能的情况
+    dict_role_combination = {}
+  # print(PackageList)
+    #拿到各个实体的packageName,packageCode
+    for entity in list_entity:
+        if entity.entity_type in ['org','company']:
+            #限制附件里角色values[label]最大概率prob
+            max_prob = 0.85
+            if str(entity.label)!="5" and entity.in_attachment:
+                if entity.values[entity.label]>max_prob:
+                    entity.values[entity.label] = max_prob
+            #过滤掉字数小于3个的实体
+            if len(entity.entity_text)<=3:
+                continue
+            values = entity.values
+            role_prob = float(values[int(entity.label)])
+            if role_prob>=on_value and str(entity.label)!="5":
+                if str(entity.label) in ["0","1"]:
+                    packageName = "Project"
+                else:
+                    if len(PackageSet)>0:
+                        packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+str(entity.label))
+                        if packagePointer is None:
+                            #continue
+                            packageName = "Project"
+                          # print(entity.entity_text, packageName,entity.sentence_index,entity.begin_index)
+                        else:
+                            #add pointer_pack
+                            entity.pointer_pack = packagePointer
+                            packageName = packagePointer.entity_text
+                          # print(entity.entity_text, packageName)
+                    else:
+                        packageName = "Project"
+                    find_flag = False
+
+                    if packageName in dict_PackageCode.keys():
+                        packageCode = dict_PackageCode[packageName]
+                    else:
+                        packageCode = ""
+                    entity.packageCode = packageCode
+                role_name = dict_role_id.get(str(entity.label))
+                entity.roleName = role_name
+                entity.packageName = packageName
+                if entity.packageName in dict_role_combination.keys():
+                    if str(entity.label) in dict_role_combination[entity.packageName].keys():
+                        dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
+                    else:
+                        dict_role_combination[entity.packageName][str(entity.label)] = set([entity.entity_text])
+                else:
+                    dict_role_combination[entity.packageName] = {}
+                    #初始化空值
+                    roleIds = [0,1,2,3,4]
+                    for _roleId in roleIds:
+                        dict_role_combination[entity.packageName][str(_roleId)] = set([""])
+                    dict_role_combination[entity.packageName][str(entity.label)].add(entity.entity_text)
+    list_real_comba = get_legal_comba(list_entity,dict_role_combination)
+  # print("===role_combination",dict_role_combination)
+  # print("== real_comba",list_real_comba)
+    #拿到最大期望值的组合
+    max_index = 0
+    max_expect = -100
+    _index = 0
+    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
+    for item_combination in list_real_comba:
+        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
+        if expect>max_expect:
+            max_index = _index
+            max_expect = expect
+        _index += 1
+    RoleList = []
+    RoleSet = set()
+    if len(list_real_comba)>0:
+        for _key in list_real_comba[max_index].keys():
+            packageName = _key.split("$$")[0]
+            label = _key.split("$$")[1]
+            role_name = dict_role_id.get(str(label))
+            entity_text = list_real_comba[max_index][_key]
+            if packageName in dict_PackageCode.keys():
+                packagecode = dict_PackageCode.get(packageName)
+            else:
+                packagecode = ""
+            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
+            RoleSet.add(entity_text)
+
+    #根据最优树来修正list_entity中角色对包的连接
+    for _entity in list_entity:
+        if _entity.pointer_pack is not None:
+            _pack_name = _entity.pointer_pack.entity_text
+            _find_flag = False
+            for _prem in RoleList:
+                if _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text:
+                    _find_flag = True
+            if not _find_flag:
+                _entity.pointer_pack = None
+    return RoleList,RoleSet,PackageList,PackageSet
+
+def getPackageScopePattern():
+    '''
+    @summary: 获取包的作用域关键词
+    '''
+    df = pd.read_excel(os.path.dirname(__file__)+"/end.xls")
+    pattern = "("
+    for item in df["list_word"]:
+        item = str(item).replace("(","\(").replace(")","\)").replace(".","\.").replace("[","\[").replace("]","\]").replace("-","\-")
+        pattern += item+"|"
+    pattern = pattern[:-1]+")[::是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
+    return pattern
+        
+pattern_packageScope = getPackageScopePattern()   
+def getPackagesFromArticle(list_sentence,list_entity):
+    '''
+    @param:
+        list_sentence:文章的句子list
+    @summary: 将包的信息插入list_entity中
+    @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
+    '''
+    
+    if len(list_sentence)==0:
+        return None
+    list_sentence.sort(key=lambda x:x.sentence_index)
+
+    PackageList = []
+    PackageList_scope = []
+    PackageSet = set()
+    dict_packageCode = dict()
+    
+    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
+    package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
+    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
+    # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
+    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
+    win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
+    model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')  # 2020/11/23 大网站规则 调整
+    number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
+
+    package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
+    # 纯数字类型的包号统一,例如:'01','1'
+    re_digital = re.compile("^\d+$")
+    def changeIndexFromWordToWords(tokens,word_index):
+        '''
+        @summary:转换某个字的字偏移为词偏移
+        '''
+        before_index = 0
+        after_index = 0
+        for i in range(len(tokens)):
+            after_index = after_index+len(tokens[i])
+            if before_index<=word_index and after_index>=word_index:
+                return i
+            before_index = after_index
+    package_names = []
+    
+    def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
+        '''
+        @summary:抽取包附近的标段号
+        @param:
+            tokens:包所在句子的分词
+            word_index:包所在字偏移
+            size:左右各取多少个词
+            pattern:提取标段号的正则
+        @return: type:string,meaning:标段号
+        '''
+        index = changeIndexFromWordToWords(tokens,word_index)
+        if index<size:
+            begin = index
+        else:
+            begin = index-size
+        if index+size>len(tokens):
+            end = len(tokens)
+        else:
+            end = index+size
+        #拿到左右两边的词语组成短语
+        text = "".join(tokens[begin:end])
+        #在短语中的字偏移
+        new_word_index = word_index-len("".join(tokens[:begin]))
+        min_distance = len(text)
+        packageCode = None
+        for the_iter in re.finditer(pattern,text):
+            #算出最小距离
+            distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
+            if distance<min_distance:
+                min_distance = distance
+                packageCode = the_iter.group(1)
+        return packageCode
+    #从标段介绍表格中提取包名和包号
+    for i in range(len(list_sentence)):
+        content = list_sentence[i].sentence_text
+        names = re.findall(package_name_pattern,content)
+        if names == []:
+            names = re.findall(other_package_pattern, content)
+        N_names = re.findall(package_N_name_pattern,content)
+        if len(names)==1 and len(N_names)==1:
+            package_names.append([names[0][-1],N_names[0][-1]])
+    for i in range(len(list_sentence)):
+        PackageList_item = []
+        PackageList_item_scope = []
+        content = list_sentence[i].sentence_text
+        tokens = list_sentence[i].tokens
+        _names = []
+        # 2021/6/23 包名称去重
+        for name in package_names:
+            if name not in _names:
+                _names.append(name)
+        # for name in package_names[:20]:
+        for name in _names[:20]:
+            for index in findAllIndex(name[0],content):
+                temp_package_number = re.findall(number_pattern,name[1])[0]
+                if re.search(re_digital,temp_package_number):
+                    temp_package_number = str(int(temp_package_number))
+                PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
+                # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
+                code = extractPackageCode(tokens, index)
+                if code is not None:
+                    dict_packageCode[temp_package_number] = code
+                PackageSet.add(temp_package_number)
+        for iter in re.finditer(package_number_pattern,content):
+            temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
+            if re.search(re_digital, temp_package_number):
+                temp_package_number = str(int(temp_package_number))
+            PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+            # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+            code = extractPackageCode(tokens, iter.span()[0])
+            if code is not None:
+                dict_packageCode[temp_package_number] = code
+            PackageSet.add(temp_package_number)
+        
+        #识别packageScope
+        for iter in re.finditer(pattern_packageScope,content):
+            PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+            # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+        PackageList_item_scope = PackageList_item +PackageList_item_scope
+        PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
+        PackageList_scope = PackageList_scope+PackageList_item_scope
+        PackageList_item.sort(key=lambda x:x["sentence_index"])
+        #PackageList = PackageList+PackageList_item
+    #不作为包
+    # if len(PackageSet)==0:
+    #     for i in range(len(list_sentence)):
+    #         PackageList_item = []
+    #         PackageList_item_scope = []
+    #         content = list_sentence[i].sentence_text
+    #         tokens = list_sentence[i].tokens
+    #         for iter in re.finditer(other_package_pattern,content):
+    #             temp_package_number = iter.group(2)
+    #             PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+    #             # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+    #             code = extractPackageCode(tokens, iter.span()[0])
+    #             if code is not None:
+    #                 dict_packageCode[temp_package_number] = code
+    #             PackageSet.add(temp_package_number)
+    #         #识别packageScope
+    #         for iter in re.finditer(pattern_packageScope,content):
+    #             PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+    #             # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+    #         PackageList_item_scope = PackageList_item +PackageList_item_scope
+    #         PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
+    #         PackageList_scope = PackageList_scope+PackageList_item_scope
+    #         PackageList_item.sort(key=lambda x:x["sentence_index"])
+
+    # 2020/11/23 大网站规则 调整
+    if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
+        for i in range(len(list_sentence)):
+            PackageList_item = []
+            PackageList_item_scope = []
+            content = list_sentence[i].sentence_text
+            tokens = list_sentence[i].tokens
+            names = re.findall(other_package_pattern, content)
+            N_names = re.findall(win_tenderer_pattern, content)
+            if len(names) != 1 or len(N_names) != 1:
+                continue
+            for iter in re.finditer(other_package_pattern,content):
+                temp_package_number = iter.group(4)
+                xinghao = re.search(model_pattern, content)
+                if xinghao:
+                    temp_package_number = temp_package_number + '+' + xinghao.group(2)
+                # print('新正则采购包名补充',temp_package_number)
+                if re.search(re_digital,temp_package_number):
+                    temp_package_number = str(int(temp_package_number))
+                PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+                # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+                code = extractPackageCode(tokens, iter.span()[0])
+                if code is not None:
+                    dict_packageCode[temp_package_number] = code
+                PackageSet.add(temp_package_number)
+            #识别packageScope
+            for iter in re.finditer(pattern_packageScope,content):
+                PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
+                # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
+            PackageList_item_scope = PackageList_item +PackageList_item_scope
+            PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
+            PackageList_scope = PackageList_scope+PackageList_item_scope
+            PackageList_item.sort(key=lambda x:x["sentence_index"])
+    pattern_punctuation = "[::()\(\),,。;;]"
+  # print("===packageList_scope",PackageList_scope)
+    for i in range(len(list_sentence)):
+        for j in range(len(PackageList_scope)):
+            if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
+                _flag = False
+                left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
+                right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
+                _left_find = re.findall(pattern_punctuation,left_str)
+                _right_find = re.findall(pattern_punctuation,right_str)
+                #print(left_str)
+                if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
+                    continue
+                if re.search("划分",right_str[:10]) is not None:
+                    continue
+                if len(_left_find)>0 and _left_find[-1] in [":",":"]:
+                    _flag = True
+                if len(_right_find)>0 and _right_find[0] in [":",":"]:
+                    _flag = True
+                if _flag:
+                    scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
+                else:
+                    if j==0:
+                        scope_begin = [0,0]
+                    else:
+                        scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
+                if j==len(PackageList_scope)-1:
+                    scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))]
+                else:
+                    scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
+                if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
+                    continue
+
+                #add package to entity
+                _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"],in_attachment=list_sentence[i].in_attachment)
+                list_entity.append(_pack_entity)
+                copy_pack = copy.copy(PackageList_scope[j])
+                copy_pack["scope"] = [scope_begin,scope_end]
+                copy_pack["hit"] = set()
+                copy_pack["pointer"] = _pack_entity
+
+                PackageList.append(copy_pack)
+    return PackageList,PackageSet,dict_packageCode
+
# KM (Hungarian) pairing of roles and attributes
def dispatch(match_list):
    """Resolve many-to-many Match candidates into a one-to-one pairing.

    Builds a role x attribute score matrix — every candidate score is
    shifted by +10000 so that any real candidate outweighs the zero cells
    of absent pairs — then runs the Hungarian algorithm
    (scipy's linear_sum_assignment) on the negated matrix to maximise the
    total score.

    :param match_list: iterable of objects exposing .main_role,
        .attribute and .value
    :return: list of (main_role, attribute) tuples, one per matched pair;
        assignments landing on a zero (non-candidate) cell are dropped.
    """
    roles = list({m.main_role for m in match_list})
    attrs = list({m.attribute for m in match_list})

    score = np.zeros(shape=(len(roles), len(attrs)))
    for m in match_list:
        # +10000 keeps every genuine candidate above the 0 of absent pairs
        score[roles.index(m.main_role), attrs.index(m.attribute)] = m.value + 10000

    cost = -score  # linear_sum_assignment minimises; we want the maximum
    rows, cols = linear_sum_assignment(cost)
    # keep only assignments whose cell holds a real candidate (non-zero cost)
    return [(roles[r], attrs[c]) for r, c in zip(rows, cols) if cost[r, c]]
+
+from BiddingKG.dl.common.Utils import getUnifyMoney
+from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
+# NOTE(review): module-level side effect — the relation-extraction model is
+# instantiated (and presumably its weights loaded) at import time; confirm
+# the load cost is acceptable for every importer of this module.
+relationExtraction_model = Model_relation_extraction()
+def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
+    '''
+    @param:
+        PackDict:文章包dict
+        roleSet:文章所有角色的公司名称
+        PackageList:文章的包信息
+        PackageSet:文章所有包的名称
+        list_entity:文章所有经过模型处理的实体
+        on_value:金额模型的阈值
+        on_value_person:联系人模型的阈值
+        sentence_len:公司和属性间隔句子的最大长度
+    @return:添加了属性信息的角色list
+    '''
+    
# attach a money amount to the role list entry identified by role id
def addMoneyByRoleid(packDict, packageName, roleid, money, money_prob):
    """Assign `money` to the role whose role_name maps from `roleid`
    (via the module-level dict_role_id), but only when the new
    probability beats the one already stored on that role.

    :return: the (mutated) packDict
    """
    target_role_name = dict_role_id.get(str(roleid))
    for role in packDict[packageName]["roleList"]:
        if role.role_name == target_role_name and money_prob > role.money_prob:
            role.money = money
            role.money_prob = money_prob
    return packDict
+                    
# attach a money entity to the role list entry matched by entity name
def addMoneyByEntity(packDict, packageName, entity, money, money_prob):
    """Update the money fields of every role in the package whose
    entity_text equals `entity`.

    A role is updated when either:
      * it has no money yet (money_prob == 0) — first assignment
        (2021/7/20), or
      * the new probability exceeds the stored one by more than 0.2, or
      * the money is written in uppercase Chinese numerals ('大写') and
        comes from the main text rather than an attachment — preferred
        since 2021/7/20.

    :param money: money entity providing entity_text, money_unit, notes
        and in_attachment
    :return: the (mutated) packDict
    """
    for role in packDict[packageName]["roleList"]:
        if role.entity_text != entity:
            continue
        first_assignment = role.money_prob == 0
        prefer_uppercase = money.notes in ['大写'] and money.in_attachment == False
        if first_assignment or money_prob > role.money_prob + 0.2 or prefer_uppercase:
            role.money = money.entity_text
            role.money_prob = money_prob
            role.money_unit = money.money_unit
    return packDict
def addRatioByEntity(packDict, packageName, entity, ratio):
    """Copy the ratio entity's text onto every role in the package whose
    entity_text equals `entity`. Mutates packDict in place; returns None."""
    for role in packDict[packageName]["roleList"]:
        if role.entity_text == entity:
            role.ratio = ratio.entity_text
def addServiceTimeByEntity(packDict, packageName, entity, serviceTime):
    """Copy the service-time entity's text onto every role in the package
    whose entity_text equals `entity`. Mutates packDict in place;
    returns None."""
    for role in packDict[packageName]["roleList"]:
        if role.entity_text == entity:
            role.serviceTime = serviceTime.entity_text
+
# look up a role name by the role's entity text
def getRoleWithText(packDict, entity_text):
    """Return the role_name of the first role (scanning packages in dict
    order) whose entity_text matches, or None when no role matches."""
    for pack in packDict.values():
        for role in pack["roleList"]:
            if role.entity_text == entity_text:
                return role.role_name
+    
def doesEntityOrLinkedEntity_inRoleSet(entity, RoleSet):
    """Return True when the entity itself or any of its linked entities has
    an entity_text contained in RoleSet.

    Note: falls through and implicitly returns None (falsy) when nothing
    matches — callers rely on truthiness, so this is kept as-is rather
    than returning False.
    """
    candidates = [entity] + entity.linked_entitys
    if any(e.entity_text in RoleSet for e in candidates):
        return True
+    
+    p_entity = 0
+
+    # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
+    money_list = [it for it in list_entity if it.entity_type=="money"]
+    for i in range(len(money_list)-1):
+        for j in range(1, len(money_list)):
+            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
+                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
+                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
+                # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
+    
+    #遍历所有实体
+    # while(p_entity<len(list_entity)):
+    #     entity = list_entity[p_entity]
+        '''
+        #招标金额从后往前找
+        if entity.entity_type=="money":
+            if entity.values[entity.label]>=on_value:
+                if str(entity.label)=="0":
+                    packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
+                    if packagePointer is None:
+                        packageName = "Project"
+                    else:
+                        packageName = packagePointer.entity_text
+                    addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
+        '''
+        ''' # 2020/11/25 与下面的联系人连接步骤重复,取消
+        if entity.entity_type=="person":
+            if entity.values[entity.label]>=on_value_person:
+                if str(entity.label)=="1":
+                    for i in range(len(PackDict["Project"]["roleList"])):
+                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
+                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
+                        # add pointer_person
+                        for _entity in list_entity:
+                            if dict_role_id.get(str(_entity.label))=="tenderee":
+                                for i in range(len(PackDict["Project"]["roleList"])):
+                                    if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
+                                        _entity.pointer_person = entity
+                elif str(entity.label)=="2":
+                    for i in range(len(PackDict["Project"]["roleList"])):
+                        if PackDict["Project"]["roleList"][i].role_name=="agency":
+                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
+                    # add pointer_person
+                    for _entity in list_entity:
+                        if dict_role_id.get(str(_entity.label))=="agency":
+                            for i in range(len(PackDict["Project"]["roleList"])):
+                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
+                                    _entity.pointer_person = entity
+    '''
+        # #金额往前找实体
+        # if entity.entity_type=="money":
+        #     if entity.values[entity.label]>=on_value:
+        #         p_entity_money= p_entity
+        #         entity_money = list_entity[p_entity_money]
+        #         if len(PackageSet)>0:
+        #             packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
+        #             if packagePointer is None:
+        #                 packageName_entity = "Project"
+        #             else:
+        #                 packageName_entity = packagePointer.entity_text
+        #         else:
+        #             packageName_entity = "Project"
+        #         while(p_entity_money>0):
+        #             entity_before = list_entity[p_entity_money]
+        #             if entity_before.entity_type in ['org','company']:
+        #                 if str(entity_before.label)=="1":
+        #                     addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
+        #                     #add pointer_money
+        #                     entity_before.pointer_money = entity_money
+        #                 break
+        #             p_entity_money -= 1
+
+
+
+        #如果实体属于角色集合,则往后找属性
+        # if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
+        #
+        #     p_entity += 1
+        #     #循环查找符合的属性
+        #     while(p_entity<len(list_entity)):
+        #
+        #         entity_after = list_entity[p_entity]
+        #         if entity_after.sentence_index-entity.sentence_index>=sentence_len:
+        #             p_entity -= 1
+        #             break
+        #         #若是遇到公司实体,则跳出循环
+        #         if entity_after.entity_type in ['org','company']:
+        #             p_entity -= 1
+        #             break
+        #         if entity_after.values is not None:
+        #             if entity_after.entity_type=="money":
+        #                 if entity_after.values[entity_after.label]>=on_value:
+        #                     '''
+        #                     #招标金额从后往前找
+        #                     if str(entity_after.label)=="0":
+        #                         packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
+        #                         if packagePointer is None:
+        #                             packageName = "Project"
+        #                         else:
+        #                             packageName = packagePointer.entity_text
+        #                         addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
+        #                     '''
+        #                     if str(entity_after.label)=="1":
+        #                         #print(entity_after.entity_text,entity.entity_text)
+        #                         _list_entitys = [entity]+entity.linked_entitys
+        #                         if len(PackageSet)>0:
+        #                             packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
+        #                             if packagePointer is None:
+        #                                 packageName_entity = "Project"
+        #                             else:
+        #                                 packageName_entity = packagePointer.entity_text
+        #                         else:
+        #                             packageName_entity = "Project"
+        #                         if str(entity.label) in ["2","3","4"]:
+        #                             # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
+        #                             if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
+        #                                 addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+        #                                                  0.5)
+        #                                 entity.pointer_money = entity_after
+        #                                 # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                             else:
+        #                                 addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
+        #                                                  entity_after.values[entity_after.label])
+        #                                 entity.pointer_money = entity_after
+        #                                 # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                                 if entity_after.values[entity_after.label]>0.6:
+        #                                     break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
+        #                             #add pointer_money
+        #                             # entity.pointer_money = entity_after
+        #                             # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
+        #                             # if entity_after.notes!='单价':
+        #                             #     break  # 2021/7/16 新增,找到中标金额即停止,不再往后找金额
+        #                 '''
+        #             if entity_after.entity_type=="person":
+        #                 if entity_after.values[entity_after.label]>=on_value_person:
+        #                     if str(entity_after.label)=="1":
+        #                         for i in range(len(roleList)):
+        #                             if roleList[i].role_name=="tenderee":
+        #                                 roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #                     elif str(entity_after.label)=="2":
+        #                         for i in range(len(roleList)):
+        #                             if roleList[i].role_name=="agency":
+        #                                 roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #                     elif str(entity_after.label)=="3":
+        #                         _list_entitys = [entity]+entity.linked_entitys
+        #                         for _entity in _list_entitys:
+        #                             for i in range(len(roleList)):
+        #                                 if roleList[i].entity_text==_entity.entity_text:
+        #                                     if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
+        #                                         break
+        #                                     roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+        #             '''
+        #
+        #         p_entity += 1
+        #
+        # p_entity += 1
+    # 记录每句的分词数量
+    tokens_num_dict = dict()
+    last_tokens_num = 0
+    for sentence in list_sentence:
+        _index = sentence.sentence_index
+        if _index == 0:
+            tokens_num_dict[_index] = 0
+        else:
+            tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
+        last_tokens_num = len(sentence.tokens)
+    attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
+    for link_attribute in attribute_type:
+        temp_entity_list = []
+        if link_attribute=="money":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                (ent.entity_type=='money' and ent.label==1 and ent.values[ent.label]>=0.5)]
+            # 删除重复的‘中投标金额’,一般为大小写两种样式
+            drop_tendererMoney = []
+            for ent_idx in range(len(temp_entity_list)-1):
+                entity = temp_entity_list[ent_idx]
+                if entity.entity_type=='money':
+                    next_entity = temp_entity_list[ent_idx+1]
+                    if next_entity.entity_type=='money':
+                        if getUnifyMoney(entity.entity_text)==getUnifyMoney(next_entity.entity_text):
+                            if (tokens_num_dict[next_entity.sentence_index] + next_entity.begin_index) - (
+                                               tokens_num_dict[entity.sentence_index] + entity.end_index) < 10:
+                                drop_tendererMoney.append(next_entity)
+            for _drop in drop_tendererMoney:
+                temp_entity_list.remove(_drop)
+        elif link_attribute=="serviceTime":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                ent.entity_type=='serviceTime']
+        elif link_attribute=="ratio":
+            temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
+                                ent.entity_type=='ratio']
+        temp_entity_list = sorted(temp_entity_list,key=lambda x: (x.sentence_index, x.begin_index))
+        temp_match_list = []
+        for ent_idx in range(len(temp_entity_list)):
+            entity = temp_entity_list[ent_idx]
+            if entity.entity_type in ['org','company']:
+                match_nums = 0
+                tenderer_nums = 0 #经过其他中投标人的数量
+                byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
+                for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
+                    after_entity = temp_entity_list[after_index]
+                    if after_entity.entity_type == link_attribute:
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        value = (-1 / 2 * (distance ** 2)) / 10000
+                        if link_attribute == "money":
+                            if after_entity.notes == '单价':
+                                value = value * 100
+                        if sentence_distance == 0:
+                            if distance < 100:
+                                # value = (-1 / 2 * (distance ** 2)) / 10000
+                                temp_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if not tenderer_nums:
+                                    byNotTenderer_match_nums += 1
+                                else:
+                                    break
+                        else:
+                            if distance < 60:
+                                # value = (-1 / 2 * (distance ** 2)) / 10000
+                                temp_match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                                if not tenderer_nums:
+                                    byNotTenderer_match_nums += 1
+                                else:
+                                    break
+                    else:
+                        tenderer_nums += 1
+                #前向查找属性
+                if ent_idx!=0 and (not match_nums or not byNotTenderer_match_nums):
+                    previous_entity = temp_entity_list[ent_idx - 1]
+                    if previous_entity.entity_type == link_attribute:
+                        # if previous_entity.sentence_index == entity.sentence_index:
+                        distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
+                        if distance < 40:
+                            # 前向 没有 /10000
+                            value = (-1 / 2 * (distance ** 2))
+                            temp_match_list.append(Match(entity, previous_entity, value))
+        # km算法分配求解
+        dispatch_result = dispatch(temp_match_list)
+        dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
+        for match in dispatch_result:
+            _entity = match[0]
+            _attribute = match[1]
+            if link_attribute=='money':
+                _entity.pointer_money = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                # print(_entity.entity_text,_attribute.entity_text)
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                if _attribute.notes == '单价' or float(_attribute.entity_text) < 5000:  # 2021/12/17 调整小金额阈值,避免203608823.html 两次金额一次万元没提取到的情况
+                    # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
+                    addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,0.5)
+                else:
+                    # print(packageName_entity,_attribute.entity_text, _attribute.values[_attribute.label])
+                    addMoneyByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute,
+                                     _attribute.values[_attribute.label])
+            elif link_attribute=='serviceTime':
+                _entity.pointer_serviceTime = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "serviceTime-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                addServiceTimeByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
+            elif link_attribute=='ratio':
+                _entity.pointer_ratio = _attribute
+                packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
+                                               "ratio-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                if packagePointer is None:
+                    packageName_entity = "Project"
+                else:
+                    packageName_entity = packagePointer.entity_text
+                addRatioByEntity(PackDict, packageName_entity, _entity.entity_text, _attribute)
+    ''''''
+    # 通过模型分类的招标/代理联系人
+    list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
+    person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
+    tenderee_contact = set()
+    tenderee_phone = set()
+    agency_contact = set()
+    agency_phone = set()
+    winter_contact = set()
+    for _person in person_list:
+        if _person.label == 1:
+            tenderee_contact.add(_person.entity_text)
+        if _person.label == 2:
+            agency_contact.add(_person.entity_text)
+    # 正则匹配无 '主体/联系人' 的电话
+    # 例:"采购人联系方式:0833-5226788,"
+    phone_pattern = '(1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' \
+                    '\+86.?1[3-9]\d{9}|' \
+                    '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
+                    '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|' \
+                    '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
+                   '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|' \
+                   '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|' \
+                   '[2-9]\d{6,7})'
+    re_tenderee_phone = re.compile(
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        # 电话号码
+        + phone_pattern)
+    # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
+    re_tenderee_phone2 = re.compile(
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        # 电话号码
+        + phone_pattern)
+    re_agent_phone = re.compile(
+        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        # 电话号码
+        + phone_pattern)
+    re_agent_phone2 = re.compile(
+        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        # 电话号码
+        + phone_pattern)
+    content = ""
+    for _sentence in list_sentence:
+        content += "".join(_sentence.tokens)
+    _content = copy.deepcopy(content)
+    while re.search("(.)(,)([^0-9])|([^0-9])(,)(.)", content):
+        content_words = list(content)
+        for i in re.finditer("(.)(,)([^0-9])", content):
+            content_words[i.span(2)[0]] = ""
+        for i in re.finditer("([^0-9])(,)(.)", content):
+            content_words[i.span(2)[0]] = ""
+        content = "".join(content_words)
+    content = re.sub("[::]|[\((]|[\))]", "", content)
+    _tenderee_phone = re.findall(re_tenderee_phone, content)
+    # 更新正则确定的角色属性
+    for i in range(len(PackDict["Project"]["roleList"])):
+        if PackDict["Project"]["roleList"][i].role_name == "tenderee":
+            _tenderee_phone = re.findall(re_tenderee_phone, content)
+            if _tenderee_phone:
+                for _phone in _tenderee_phone:
+                    _phone = _phone.split("/") # 分割多个号码
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        tenderee_phone.add(one_phone)
+            _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
+            if _tenderee_phone2:
+                for _phone in _tenderee_phone2:
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        tenderee_phone.add(one_phone)
+        if PackDict["Project"]["roleList"][i].role_name == "agency":
+            _agent_phone = re.findall(re_agent_phone, content)
+            if _agent_phone:
+                for _phone in _agent_phone:
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        agency_phone.add(one_phone)
+            _agent_phone2 = re.findall(re_agent_phone2, content)
+            if _agent_phone2:
+                for _phone in _agent_phone2:
+                    _phone = _phone.split("/")
+                    for one_phone in _phone:
+                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
+                        agency_phone.add(one_phone)
+
+    # 正则提取电话号码实体
+    # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
+    phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                       '\+86.?1[3-9]\d{9}|'
+                       # '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '[2-9]\d{6,7}')
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
+                            "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
+    phone_entitys = []
+    code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
+    for _sentence in list_sentence:
+        sentence_text = _sentence.sentence_text
+        in_attachment = _sentence.in_attachment
+        list_tokenbegin = []
+        begin = 0
+        for i in range(0, len(_sentence.tokens)):
+            list_tokenbegin.append(begin)
+            begin += len(str(_sentence.tokens[i]))
+        list_tokenbegin.append(begin + 1)
+        # 排除网址、邮箱、项目编号实体
+        error_list = []
+        for i in re.finditer(url_pattern, sentence_text):
+            error_list.append((i.start(), i.end()))
+        for i in re.finditer(email_pattern, sentence_text):
+            error_list.append((i.start(), i.end()))
+        for code_ent in [ent for ent in code_entitys if ent.sentence_index==_sentence.sentence_index]:
+            error_list.append((code_ent.wordOffset_begin,code_ent.wordOffset_end))
+        res_set = set()
+        for i in re.finditer(phone, sentence_text):
+            is_continue = False
+            for error_ent in error_list:
+                if i.start()>=error_ent[0] and i.end()<=error_ent[1]:
+                    is_continue = True
+                    break
+            if is_continue:
+                continue
+            res_set.add((i.group(), i.start(), i.end()))
+        res_set = sorted(list(res_set),key=lambda x:x[1])
+        last_phone_mask = True
+        for item_idx in range(len(res_set)):
+            item = res_set[item_idx]
+            phone_left = sentence_text[max(0, item[1] - 10):item[1]]
+            phone_right = sentence_text[item[2]:item[2] + 8]
+            if re.search("电话|手机|联系人|联系方式”",re.sub(",","",phone_left)):
+                pass
+            else:
+                # 排除“传真号”和其它错误项
+                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
+                    if not re.search("电,?话", phone_left):
+                        last_phone_mask = False
+                        continue
+                if re.search("注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
+                    last_phone_mask = False
+                    continue
+                if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+|元", phone_right):
+                    last_phone_mask = False
+                    continue
+                # 前后跟着字母
+                if re.search("[a-zA-Z/]+$", phone_left) or re.search("^[a-zA-Z/]+", phone_right):
+                    last_phone_mask = False
+                    continue
+                # 前后跟着长度小于一定值数字的正则排除
+                if re.search("\d+[-—-―]?\d*$",phone_left) or re.search("^\d+[-—-―]?\d*",phone_right):
+                    phone_left_number = re.search("\d+[-—-―]?\d*$",phone_left)
+                    phone_right_number = re.search("^\d+[-—-―]?\d+",phone_right)
+                    if phone_left_number:
+                        if len(phone_left_number.group())<7:
+                            last_phone_mask = False
+                            continue
+                    if phone_right_number:
+                        if len(phone_right_number.group())<7:
+                            last_phone_mask = False
+                            continue
+                # if:上一个phone实体不符合条件
+                if not last_phone_mask:
+                    item_start = item[1]
+                    last_item_end = res_set[item_idx-1][2]
+                    if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―、]+$",sentence_text[last_item_end:item_start]):
+                        last_phone_mask = False
+                        continue
+            for j in range(len(list_tokenbegin)):
+                if list_tokenbegin[j] == item[1]:
+                    begin_index = j
+                    break
+                elif list_tokenbegin[j] > item[1]:
+                    begin_index = j - 1
+                    break
+            for j in range(begin_index, len(list_tokenbegin)):
+                if list_tokenbegin[j] >= item[2]:
+                    end_index = j - 1
+                    break
+            _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
+                             item[2],in_attachment=in_attachment)
+            phone_entitys.append(_entity)
+            last_phone_mask = True
+
def is_company(entity, text):
    """Decide whether a company/org entity is a real role participant
    rather than part of an address/location or a bank-account mention.

    Args:
        entity: NER entity exposing .label, .values, .is_tail and
                .wordOffset_begin (char offset of the entity in `text`).
        text:   full sentence text the entity was extracted from.

    Returns:
        True  -> treat the entity as a company/org role candidate.
        False -> its left context marks it as an address/bank mention.
    """
    # Confident non-"other" role prediction (label 5 is "other"): accept.
    if entity.label != 5 and entity.values[entity.label] > 0.5:
        return True
    # Fix: original read `ent.is_tail`, silently depending on a closure
    # variable bound by the caller's loop; use the parameter instead.
    if entity.is_tail:
        return False
    # Up to 10 chars of left context, with separator punctuation stripped.
    # Fix: original pattern ",()\(\)" matched only the literal sequence
    # ",()"; a character class is what was intended here.
    entity_left = text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
    entity_left = re.sub("[,()\(\)]", "", entity_left)
    entity_left = entity_left[-5:]
    # An address / venue / bank cue right before the entity => not a role.
    if re.search("地址|地点|银行[::]", entity_left):
        return False
    return True
+    pre_entity = []
+    for ent in list_entity:
+        if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
+                or (ent.entity_type=='location' and len(ent.entity_text)>5):
+            pre_entity.append(ent)
+    text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
+    # print(pre_data)
+    maxlen = 512
+    relation_list = []
+    if 0<len(text_data)<=maxlen:
+        relation_list = relationExtraction_model.predict(text_data, pre_data)
+    else:
+        # 公告大于maxlen时,分段预测
+        start = 0
+        # print("len(pre_data)",len(pre_data))
+        temp_data = []
+        deal_data = 0
+        while start<len(pre_data):
+            _pre_data = pre_data[start:start+maxlen]
+            _text_data = text_data[start:start+maxlen]
+            if relationExtraction_model.check_data(_pre_data):
+                temp_data.append((_text_data,_pre_data))
+            else:
+                if temp_data:
+                    deal_data += len(temp_data)
+                    if deal_data>4:
+                        break
+                    for _text_data, _pre_data in temp_data:
+                        relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+                    temp_data = []
+            start = start + maxlen - 120
+        # print("预测数据:",len(temp_data))
+        # 去重结果
+        relation_list = list(set(relation_list))
+    # print(relation_list)
+    right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
+    linked_company = set()
+    linked_person = set()
+    linked_connetPerson = set()
+    linked_phone = set()
+    for predicate in ["rel_address","rel_phone","rel_person"]:
+        _match_list = []
+        _match_combo = []
+        for relation in relation_list:
+            _subject = relation[0]
+            _object = relation[2]
+            if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
+                if relation[1]==predicate:
+                    if predicate=="rel_person":
+                        if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
+                            continue
+                    distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
+                                tokens_num_dict[_subject.sentence_index] + _subject.end_index)
+                    if distance>0:
+                        value = (-1 / 2 * (distance ** 2))/10000
+                    else:
+                        distance = abs(distance)
+                        value = (-1 / 2 * (distance ** 2))
+                    _match_list.append(Match(_subject,_object,value))
+                    _match_combo.append((_subject,_object))
+        match_result = dispatch(_match_list)
+        error_list = []
+        for mat in list(set(_match_combo)-set(match_result)):
+            for temp in match_result:
+                if mat[1]==temp[1] and mat[0]!=temp[0]:
+                    error_list.append(mat)
+                    break
+        result = list(set(_match_combo)-set(error_list))
+        if predicate=='rel_person':
+            # 从后往前更新状态,已近后向链接的属性不在前向链接(解决错误链接)
+            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
+            for combo in result:
+                is_continue = False
+                if not combo[0].pointer_person:
+                    combo[0].pointer_person = []
+                if combo[1].begin_index<combo[0].begin_index:
+                    if combo[0].pointer_person:
+                        for temp in combo[0].pointer_person:
+                            if temp.begin_index>combo[0].begin_index:
+                                is_continue = True
+                                break
+                if is_continue: continue
+                combo[0].pointer_person.append(combo[1])
+                linked_company.add(combo[0])
+                linked_person.add(combo[1])
+                # print(1,combo[0].entity_text,combo[1].entity_text)
+        if predicate=='rel_address':
+            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
+            for combo in result:
+                if combo[0].pointer_address:
+                    continue
+                combo[0].pointer_address = combo[1]
+                # print(2,combo[0].entity_text,combo[1].entity_text)
+        if predicate=='rel_phone':
+            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
+            for combo in result:
+                is_continue = False
+                if not combo[0].person_phone:
+                    combo[0].person_phone = []
+                if combo[1].begin_index<combo[0].begin_index:
+                    if combo[0].person_phone:
+                        for temp in combo[0].person_phone:
+                            if temp.begin_index>combo[0].begin_index:
+                                is_continue = True
+                                break
+                if is_continue: continue
+                combo[0].person_phone.append(combo[1])
+                linked_connetPerson.add(combo[0])
+                linked_phone.add(combo[1])
+                if combo[0].label in [1,2]:
+                    if PackDict.get("Project"):
+                        for i in range(len(PackDict["Project"]["roleList"])):
+                            if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
+                                    or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
+                                PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
+                                break
+                # print(3,combo[0].entity_text,combo[1].entity_text)
+    # "联系人——联系电话" 链接规则补充
+    person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
+    person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(person_phone_EntityList)):
+        entity = person_phone_EntityList[ent_idx]
+        if entity.entity_type=="person":
+            match_nums = 0
+            person_nums = 0  # 经过其他中联系人的数量
+            byNotPerson_match_nums = 0  # 跟在联系人后面的属性
+            phone_nums = 0 # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
+                after_entity = person_phone_EntityList[after_index]
+                if after_entity.entity_type == "phone":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    phone_nums += 1
+                    if distance>100 or phone_nums>=4:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                    else:
+                        if distance < 50:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                else:
+                    person_nums += 1
+            # 前向查找属性
+            if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
+                previous_entity = person_phone_EntityList[ent_idx - 1]
+                if previous_entity.entity_type == 'phone':
+                    # if previous_entity.sentence_index == entity.sentence_index:
+                    distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                            tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
+                    if distance < 40:
+                        # 前向 没有 /10000
+                        value = (-1 / 2 * (distance ** 2))
+                        t_match_list.append(Match(entity, previous_entity, value))
+    # km算法分配求解(person-phone)
+    t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
+    personphone_result = dispatch(t_match_list)
+    personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in personphone_result:
+        _person = match[0]
+        _phone = match[1]
+        if not _person.person_phone:
+            _person.person_phone = []
+        _person.person_phone.append(_phone)
+    # 多个招标人/代理人或者别称
+    for idx in range(1,len(pre_entity)):
+        _pre_entity = pre_entity[idx]
+        if _pre_entity in linked_company and _pre_entity.label==5:
+            last_ent = pre_entity[idx-1]
+            if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]:
+                if last_ent.sentence_index==_pre_entity.sentence_index:
+                    mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin]
+                    if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text):
+                        _pre_entity.label = last_ent.label
+                        _pre_entity.values[last_ent.label] = 0.6
+    # 2022/01/25 固定电话可连多个联系人
+    temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
+    temp_person_entitys2 = [] #和固定电话相连的联系人
+    for entity in temp_person_entitys:
+        if entity.person_phone:
+            for _phone in entity.person_phone:
+                if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                    temp_person_entitys2.append(entity)
+                    break
+    for index in range(len(temp_person_entitys)):
+        entity = temp_person_entitys[index]
+        if entity in temp_person_entitys2:
+            last_person = entity
+            for after_index in range(index + 1, min(len(temp_person_entitys), index + 5)):
+                after_entity = temp_person_entitys[after_index]
+                if after_entity.sentence_index == last_person.sentence_index and after_entity.begin_index - last_person.end_index < 3:
+                    for _phone in entity.person_phone:
+                        if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                            if _phone not in after_entity.person_phone:
+                                after_entity.person_phone.append(_phone)
+                    last_person = after_entity
+                else:
+                    break
+            if index==0:
+                continue
+            last_person = entity
+            for before_index in range(index-1, max(-1,index-5), -1):
+                before_entity = temp_person_entitys[before_index]
+                if before_entity.sentence_index == last_person.sentence_index and last_person.begin_index - before_entity.end_index < 3:
+                    for _phone in entity.person_phone:
+                        if not re.search("^1[3-9]\d{9}$", _phone.entity_text):
+                            if _phone not in before_entity.person_phone:
+                                before_entity.person_phone.append(_phone)
+                    last_person = before_entity
+                else:
+                    break
+    # 更新person为招标/代理联系人的联系方式
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
+            if PackDict[k]["roleList"][i].role_name == "tenderee":
+                for _person in person_list:
+                    if _person.label==1:#招标联系人
+                        person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
+                        for _p in person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
+                        if not person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
+            if PackDict[k]["roleList"][i].role_name == "agency":
+                for _person in person_list:
+                    if _person.label==2:#代理联系人
+                        person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
+                        for _p in person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
+                        if not person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
+    # 更新 PackDict
+    not_sure_linked = []
+    for link_p in list(linked_company):
+        for k in PackDict.keys():
+            for i in range(len(PackDict[k]["roleList"])):
+                if PackDict[k]["roleList"][i].role_name == "tenderee":
+                    if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
+                        not_sure_linked.append(link_p)
+                        continue
+                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
+                        for per in link_p.pointer_person:
+                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
+                            if not person_phone:
+                                if per.entity_text not in agency_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                    continue
+                            for _p in person_phone:
+                                if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+                elif PackDict[k]["roleList"][i].role_name == "agency":
+                    if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
+                        not_sure_linked.append(link_p)
+                        continue
+                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
+                        for per in link_p.pointer_person:
+                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
+                            if not person_phone:
+                                if per.entity_text not in tenderee_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                    continue
+                            for _p in person_phone:
+                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+                else:
+                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
+                        for per in link_p.pointer_person:
+                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
+                            if not person_phone:
+                                if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                    winter_contact.add(per.entity_text)
+                                    continue
+                            for _p in person_phone:
+                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
+                                        per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+                                    winter_contact.add(per.entity_text)
+    # 更新org/company实体label为0,1的链接
+    for link_p in not_sure_linked:
+        for k in PackDict.keys():
+            for i in range(len(PackDict[k]["roleList"])):
+                if PackDict[k]["roleList"][i].role_name == "tenderee":
+                    if link_p.label == 0:
+                        for per in link_p.pointer_person:
+                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
+                            if not person_phone:
+                                if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                    continue
+                            for _p in person_phone:
+                                if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+                elif PackDict[k]["roleList"][i].role_name == "agency":
+                    if link_p.label == 1:
+                        for per in link_p.pointer_person:
+                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
+                            if not person_phone:
+                                if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
+                                    continue
+                            for _p in person_phone:
+                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
+                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
+
+    re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
+    split_list = [0] * 16
+    split_dict = {
+        "一、": 1,
+        "二、": 2,
+        "三、": 3,
+        "四、": 4,
+        "五、": 5,
+        "六、": 6,
+        "七、": 7,
+        "八、": 8,
+        "九、": 9,
+        "十、": 10,
+        "十一、": 11,
+        "十二、": 12,
+        "十三、": 13,
+        "十四、": 14,
+        "十五、": 15
+    }
+
+    for item in re.finditer(re_split, _content):
+        _index = split_dict.get(item.group()[1:])
+        if not split_list[_index]:
+            split_list[_index] = item.span()[0] + 1
+    split_list = [i for i in split_list if i != 0]
+    start = 0
+    new_split_list = []
+    for idx in split_list:
+        new_split_list.append((start, idx))
+        start = idx
+    new_split_list.append((start, len(_content)))
+    # 实体列表按照“公告分段”分组
+    words_num_dict = dict()
+    last_words_num = 0
+    for sentence in list_sentence:
+        _index = sentence.sentence_index
+        if _index == 0:
+            words_num_dict[_index] = 0
+        else:
+            words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
+        last_words_num = len(sentence.sentence_text)
+
+    # 公司-联系人连接(km算法)
+    re_phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                       '\+86.?1[3-9]\d{9}|'
+                       '0[1-9]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}.?转\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]\d{7,8}[-—-―]\d{1,4}|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=1[3-9]\d{9})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?)|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
+                       '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
+                       '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '[2-9]\d{6,7}')
+    key_phone = re.compile("联系方式|电话|联系人|负责人")
+    temporary_list2 = []
+    for entity in list_entity:
+        # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
+        if entity.entity_type in ['org', 'company', 'person']:
+            temporary_list2.append(entity)
+    temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
+    new_temporary_list2 = []
+    for _split in new_split_list:
+        temp_list = []
+        for _entity in temporary_list2:
+            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
+                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
+                temp_list.append(_entity)
+            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
+                break
+        new_temporary_list2.append(temp_list)
+    # print(new_temporary_list2)
+    match_list2 = []
+    for split_index in range(len(new_temporary_list2)):
+        split_entitys = new_temporary_list2[split_index]
+        is_skip = False
+        for index in range(len(split_entitys)):
+            entity = split_entitys[index]
+            if is_skip:
+                is_skip = False
+                continue
+            else:
+                if entity.entity_type in ['org', 'company']:
+                    if entity.label != 5 or entity.entity_text in roleSet:
+                        match_nums = 0
+                        for after_index in range(index + 1, min(len(split_entitys), index + 4)):
+                            after_entity = split_entitys[after_index]
+                            if after_entity.entity_type in ['person']:
+                                # 实体为中标人/候选人,联系人已确定类别【1,2】
+                                if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
+                                    break
+                                if after_entity.label in [1, 2, 3]:
+                                    distance = (tokens_num_dict[
+                                                    after_entity.sentence_index] + after_entity.begin_index) - (
+                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                                    if sentence_distance == 0:
+                                        if distance < 100:
+                                            if (entity.label == 0 and after_entity.label == 1) or (
+                                                    entity.label == 1 and after_entity.label == 2):
+                                                distance = distance / 100
+                                            value = (-1 / 2 * (distance ** 2)) / 10000
+                                            match_list2.append(Match(entity, after_entity, value))
+                                            match_nums += 1
+                                    else:
+                                        if distance < 60:
+                                            if (entity.label == 0 and after_entity.label == 1) or (
+                                                    entity.label == 1 and after_entity.label == 2):
+                                                distance = distance / 100
+                                            value = (-1 / 2 * (distance ** 2)) / 10000
+                                            match_list2.append(Match(entity, after_entity, value))
+                                            match_nums += 1
+                            if after_entity.entity_type in ['org', 'company']:
+                                # 解决在‘地址’中识别出org/company的问题
+                                # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
+                                if entity.label != 5 and after_index == index + 1 and (
+                                        after_entity.label == entity.label or after_entity.label == 5):
+                                    distance = (tokens_num_dict[
+                                                    after_entity.sentence_index] + after_entity.begin_index) - (
+                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                                    if distance < 20:
+                                        after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
+                                                                                                                  after_entity.begin_index - 10):after_entity.begin_index]
+                                        after_entity_right = list_sentence[after_entity.sentence_index].tokens[
+                                                             after_entity.end_index + 1:after_entity.end_index + 6]
+                                        after_entity_left = "".join(after_entity_left)
+                                        if len(after_entity_left) > 20:
+                                            after_entity_left = after_entity_left[-20:]
+                                        after_entity_right = "".join(after_entity_right)[:10]
+                                        if re.search("地,?址", after_entity_left):
+                                            is_skip = True
+                                            continue
+                                        if re.search("\(|(", after_entity_left) and re.search("\)|)",
+                                                                                              after_entity_right):
+                                            is_skip = True
+                                            continue
+                                if entity.label in [0, 1] and after_entity.label in [0,
+                                                                                     1] and entity.label == after_entity.label:
+                                    break
+                                if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
+                                    index + 1].entity_type == "person":
+                                    break
+                                if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
+                                    break
+                                if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
+                                    break
+                        # 搜索没有联系人的电话
+                        mid_tokens = []
+                        is_same_sentence = False
+                        if index == len(split_entitys) - 1:
+                            for i in range(entity.sentence_index, len(list_sentence)):
+                                mid_tokens += list_sentence[i].tokens
+                            mid_tokens = mid_tokens[entity.end_index + 1:]
+                            mid_sentence = "".join(mid_tokens)
+                            have_phone = re.findall(re_phone, mid_sentence)
+                            if have_phone:
+                                if re.findall(re_phone, mid_sentence.split("。")[0]):
+                                    is_same_sentence = True
+                                _phone = have_phone[0]
+                                phone_begin = mid_sentence.find(_phone)
+                                if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
+                                        new_split_list[split_index][1]:
+                                    mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
+                                    if re.search(key_phone, mid_sentence):
+                                        distance = 1
+                                        if is_same_sentence:
+                                            if phone_begin <= 200:
+                                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                                match_list2.append(Match(entity, (entity, _phone), value))
+                                                match_nums += 1
+                                        else:
+                                            if phone_begin <= 60:
+                                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                                match_list2.append(Match(entity, (entity, _phone), value))
+                                                match_nums += 1
+                        else:
+                            next_entity = split_entitys[index + 1]
+                            if next_entity.entity_type in ["org","company"]:
+                                _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 20):next_entity.wordOffset_begin]
+                                _entity_left2 = re.sub(",()\(\)::", "", _entity_left)
+                                _entity_left2 = _entity_left2[-5:]
+                                if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
+                                    if index + 2<= len(split_entitys) - 1:
+                                        next_entity = split_entitys[index + 2]
+                            if entity.sentence_index == next_entity.sentence_index:
+                                mid_tokens += list_sentence[entity.sentence_index].tokens[
+                                              entity.end_index + 1:next_entity.begin_index]
+                            else:
+                                sentence_index = entity.sentence_index
+                                while sentence_index <= next_entity.sentence_index:
+                                    mid_tokens += list_sentence[sentence_index].tokens
+                                    sentence_index += 1
+                                mid_tokens = mid_tokens[entity.end_index + 1:-(len(
+                                    list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
+                            mid_sentence = "".join(mid_tokens)
+                            have_phone = re.findall(re_phone, mid_sentence)
+                            if have_phone:
+                                if re.findall(re_phone, mid_sentence.split("。")[0]):
+                                    is_same_sentence = True
+                                _phone = have_phone[0]
+                                phone_begin = mid_sentence.find(_phone)
+                                mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
+                                if re.search(key_phone, mid_sentence):
+                                    p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
+                                    if next_entity.entity_type == 'person' and _phone in p_phone:
+                                        pass
+                                    else:
+                                        distance = (tokens_num_dict[
+                                                        next_entity.sentence_index] + next_entity.begin_index) - (
+                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                                        distance = distance / 2
+                                        if is_same_sentence:
+                                            if phone_begin <= 200:
+                                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                                match_list2.append(Match(entity, (entity, _phone), value))
+                                                match_nums += 1
+                                        else:
+                                            if phone_begin <= 60:
+                                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                                match_list2.append(Match(entity, (entity, _phone), value))
+                                                match_nums += 1
+                        # 实体无匹配时,尝试前向查找匹配
+                        if not match_nums:
+                            if (entity.label != 5 or entity.entity_text in roleSet) and entity.values[entity.label] >= 0.5 and index != 0:
+                                previous_entity = split_entitys[index - 1]
+                                if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
+                                    if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
+                                        continue
+                                    if previous_entity.sentence_index == entity.sentence_index:
+                                        distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                                tokens_num_dict[
+                                                    previous_entity.sentence_index] + previous_entity.end_index)
+                                        if distance < 20:
+                                            # 距离相等时,前向添加处罚值
+                                            # distance += 1
+                                            # 前向 没有 /10000
+                                            value = (-1 / 2 * (distance ** 2))
+                                            match_list2.append(Match(entity, previous_entity, value))
+    # print(match_list2)
+    match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
+    # print(match_list2)
+    # km算法分配求解
+    result2 = dispatch(match_list2)
+    # print(result2)
+    for match in result2:
+        entity = match[0]
+        # print(entity.entity_text)
+        # print(match.attribute)
+        entity_index = list_entity.index(entity)
+        is_update = False
+        if isinstance(match[1], tuple):
+            person_ = ''
+            phone_ = match[1][1].split("/") # 分割多个号码
+            # print(person_,phone_)
+        else:
+            person_ = match[1].entity_text
+            phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
+        for k in PackDict.keys():
+            for i in range(len(PackDict[k]["roleList"])):
+                if PackDict[k]["roleList"][i].role_name == "tenderee":
+                    # if not PackDict[k]["roleList"][i].linklist:
+                        if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
+                            if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
+                                if not phone_:
+                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
+                                for p in phone_:
+                                    # if not person_ and len()
+                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
+                                is_update = True
+                elif PackDict[k]["roleList"][i].role_name == "agency":
+                    # if not PackDict[k]["roleList"][i].linklist:
+                        if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
+                            if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
+                                if not phone_:
+                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
+                                for p in phone_:
+                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
+                                is_update = True
+                else:
+                    if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
+                        if not PackDict[k]["roleList"][i].linklist:
+                            if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
+                                    person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
+                                if not phone_:
+                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
+                                for p in phone_:
+                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
+                                is_update = True
+        if not person_:
+            is_update = False
+        if is_update:
+            # 更新 list_entity
+            if not list_entity[entity_index].pointer_person:
+                list_entity[entity_index].pointer_person = []
+            list_entity[entity_index].pointer_person.append(match[1])
+
+    linked_person = []
+    linked_persons_with = []
+    for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
+        if company_entity.pointer_person:
+            for _person in company_entity.pointer_person:
+                linked_person.append(_person)
+                linked_persons_with.append(company_entity)
+    # 一个公司对应多个联系人的补充
+    person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
+    person_entitys = person_entitys[::-1]
+    for index in range(len(person_entitys)):
+        entity = person_entitys[index]
+        prepare_link = []
+        if entity not in linked_person:
+            prepare_link.append(entity)
+            last_person = entity
+            for after_index in range(index + 1, min(len(person_entitys), index + 5)):
+                after_entity = person_entitys[after_index]
+                if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
+                    if after_entity in linked_person:
+                        _index = linked_person.index(after_entity)
+                        with_company = linked_persons_with[_index]
+                        for i in range(len(PackDict["Project"]["roleList"])):
+                            if PackDict["Project"]["roleList"][i].role_name == "tenderee":
+                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
+                                    for item in prepare_link:
+                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
+                                        for _p in person_phone:
+                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
+                                        with_company.pointer_person.append(item)
+                                        linked_person.append(item)
+                            elif PackDict["Project"]["roleList"][i].role_name == "agency":
+                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
+                                    for item in prepare_link:
+                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
+                                        for _p in person_phone:
+                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
+                                        with_company.pointer_person.append(item)
+                                        linked_person.append(item)
+                            else:
+                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
+                                    for item in prepare_link:
+                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
+                                        for _p in person_phone:
+                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
+                                        with_company.pointer_person.append(item)
+                                        linked_person.append(item)
+                        break
+                    else:
+                        prepare_link.append(after_entity)
+                        last_person = after_entity
+                        continue
+
+    # 统一同类角色的属性
+    if PackDict.get("Project"):
+        for i in range(len(PackDict["Project"]["roleList"])):
+            # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
+            for _entity in list_entity:
+                if _entity.entity_type in ['org','company']:
+                    is_similar = False
+                    # entity_text相同
+                    if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
+                        is_similar = True
+                    # entity.label为【0,1】
+                    if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
+                        is_similar = True
+                    if is_similar:
+                        linked_entitys = _entity.linked_entitys
+                        if linked_entitys:
+                            for linked_entity in linked_entitys:
+                                pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
+                                for _pointer_person in pointer_person:
+                                    _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
+                                    for _p in _phone:
+                                        if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
+                                            PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))
+
+    # "roleList"中联系人电话去重
+    for i in range(len(PackDict["Project"]["roleList"])):
+        # print(123, PackDict["Project"]["roleList"][i].linklist)
+        # 带有联系人的电话
+        with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
+        # 带有电话的联系人
+        with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
+        remove_list = []
+        for item in PackDict["Project"]["roleList"][i].linklist:
+            if not item[0]:
+                if item[1] in with_person:
+                    # 删除重复的无联系人电话
+                    remove_list.append(item)
+            elif not item[1]:
+                if item[0] in with_phone:
+                    remove_list.append(item)
+        for _item in remove_list:
+            PackDict["Project"]["roleList"][i].linklist.remove(_item)
+
+    # 联系人——电子邮箱链接
+    temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
+    temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
+    new_temporary_list3 = []
+    for _split in new_split_list:
+        temp_list = []
+        for _entity in temporary_list3:
+            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
+                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
+                temp_list.append(_entity)
+            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
+                break
+        new_temporary_list3.append(temp_list)
+    # print(new_temporary_list3)
+    match_list3 = []
+    for split_index in range(len(new_temporary_list3)):
+        split_entitys = new_temporary_list3[split_index]
+        for index in range(len(split_entitys)):
+            entity = split_entitys[index]
+            if entity.entity_type == 'person':
+                match_nums = 0
+                for after_index in range(index + 1, min(len(split_entitys), index + 4)):
+                    after_entity = split_entitys[after_index]
+                    if match_nums > 2:
+                        break
+                    if after_entity.entity_type == 'email':
+                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
+                        sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        if sentence_distance == 0:
+                            if distance < 100:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                        else:
+                            if distance < 60:
+                                if (entity.label == 0 and after_entity.label == 1) or (
+                                        entity.label == 1 and after_entity.label == 2):
+                                    distance = distance / 100
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list3.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                # 前向查找匹配
+                # if not match_nums:
+                if index != 0:
+                    previous_entity = split_entitys[index - 1]
+                    if previous_entity.entity_type == 'email':
+                        if previous_entity.sentence_index == entity.sentence_index:
+                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                                    tokens_num_dict[
+                                        previous_entity.sentence_index] + previous_entity.end_index)
+                            if distance < 30:
+                                # 距离相等时,前向添加处罚值
+                                # distance += 1
+                                # 前向 没有 /10000
+                                value = (-1 / 2 * (distance ** 2))
+                                match_list3.append(Match(entity, previous_entity, value))
+    # print(match_list3)
+    # km算法分配求解
+    result3 = dispatch(match_list3)
+    for match in result3:
+        match_person = match[0]
+        match_email = match[1]
+        match_person.pointer_email = match_email
+
+    # # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
+    # temp_ent_list = []  # 临时列表,记录0,1角色及3联系人
+    # other_person = []  # 阈值以上的联系人列表
+    # link_person = []   # 有电话没联系上角色的person列表
+    # other_ent = []
+    # link_ent = []
+    # found_person = False
+    # ent_list = []
+    # for entity in list_entity:
+    #     if entity.entity_type in ['org','company','person']:
+    #         ent_list.append(entity)
+    # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
+    # #for list_index in range(len(ent_list)):
+    #     #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
+    #        #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
+    #         #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
+    # # 2020/11/25增加确定角色联系人判断
+    # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
+    # # 招标/代理在同一句中交叉情况的处理
+    # for index in range(len(ent_list)):
+    #     entity = ent_list[index]
+    #     if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
+    #         if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
+    #             if ent_list[index+1].begin_index - entity.end_index < 30:
+    #                 if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
+    #                     if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
+    #                             ent_list[index+2].label==3 and ent_list[index+3].label==3:
+    #                         ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
+    #
+    #
+    # for index in range(len(ent_list)):
+    #     entity = ent_list[index]
+    #     if entity.entity_type=="person":
+    #         if str(entity.label) == "0":  # 2020/11/25 非联系人直接跳过
+    #             continue
+    #         if entity.values[entity.label]>on_value_person:
+    #             if str(entity.label)=="1":
+    #                 for i in range(len(PackDict["Project"]["roleList"])):
+    #                     if PackDict["Project"]["roleList"][i].role_name=="tenderee":
+    #                         PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
+    #                         link_person.append(entity.entity_text)
+    #                         link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
+    #                 # add pointer_person
+    #                 for _entity in list_entity:
+    #                     if dict_role_id.get(str(_entity.label))=="tenderee":
+    #                         for i in range(len(PackDict["Project"]["roleList"])):
+    #                             if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
+    #                                 _entity.pointer_person = entity
+    #             elif str(entity.label)=="2":
+    #                 for i in range(len(PackDict["Project"]["roleList"])):
+    #                     if PackDict["Project"]["roleList"][i].role_name=="agency":
+    #                         PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
+    #                         link_person.append(entity.entity_text)
+    #                         link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
+    #                 # add pointer_person
+    #                 for _entity in list_entity:
+    #                     if dict_role_id.get(str(_entity.label))=="agency":
+    #                         for i in range(len(PackDict["Project"]["roleList"])):
+    #                             if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
+    #                                 _entity.pointer_person = entity
+    #             elif str(entity.label)=="3":
+    #                 if entity.entity_text in sure_person_set:  # 2020/11/25 排除已经确定角色的联系人
+    #                     continue
+    #                 #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
+    #                 other_person.append(entity.entity_text)
+    #                 temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
+    #
+    #     #if entity.entity_text in roleSet:
+    #     if entity.entity_text in roleSet:
+    #         if entity.label in [0,1]:
+    #             other_ent.append(entity.entity_text)
+    #             temp_ent_list.append((entity.entity_text, entity.label,entity))
+    #         for behind_index in range(index+1, len(ent_list)):
+    #             entity_after = ent_list[behind_index]
+    #             if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']:   # 只在本句中找联系人
+    #                 break
+    #             if entity_after.values is not None:
+    #                 if entity_after.entity_type=="person":
+    #                     if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
+    #                         break
+    #                     if entity_after.values[entity_after.label]>on_value_person:
+    #                         if str(entity_after.label)=="1":
+    #                             for i in range(len(PackDict["Project"]["roleList"])):
+    #                                 if PackDict["Project"]["roleList"][i].role_name=="tenderee":
+    #                                     PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+    #                                     link_person.append(entity_after.entity_text)
+    #                                     link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
+    #                         elif str(entity_after.label)=="2":
+    #                             for i in range(len(PackDict["Project"]["roleList"])):
+    #                                 if PackDict["Project"]["roleList"][i].role_name=="agency":
+    #                                     PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+    #                                     link_person.append(entity_after.entity_text)
+    #                                     link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
+    #                         elif str(entity_after.label)=="3":
+    #                             if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
+    #                                 break
+    #                             elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
+    #                                 break
+    #                             for pack in PackDict.keys():
+    #                                 for i in range(len(PackDict[pack]["roleList"])):
+    #                                     if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
+    #                                         #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
+    #                                             #break
+    #                                         PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
+    #                                         link_person.append(entity_after.entity_text)
+    #                                         #add pointer_person
+    #                                         entity.pointer_person = entity_after
+    #
+    # not_link_person = [person for person in other_person if person not in link_person]
+    # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
+    # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
+    #     item = temp_ent_list
+    #     for i in range(len(item)):
+    #         if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
+    #             if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
+    #                 item[i+1], item[i+2] = item[i+2], item[i+1]
+    #     for i in range(len(item)-1, -1, -1):
+    #         if item[i][0] in not_link_ent:
+    #             for pack in PackDict.keys():
+    #                 for role in PackDict[pack]["roleList"]:
+    #                     if role.entity_text == item[i][0] and len(role.linklist) < 1:
+    #                         for j in range(i+1, len(item)):
+    #                             if item[j][0] in not_link_person:
+    #                                 role.linklist.append(item[j][:2])
+    #                                 #add pointer_person
+    #                                 item[i][2].pointer_person = item[j][2]
+    #                                 break
+    #                             else:
+    #                                 break
+    # # 电话没有联系人的处理
+    # role_with_no_phone = []
+    # for i in range(len(PackDict["Project"]["roleList"])):
+    #     if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
+    #         if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
+    #             role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
+    #         else:
+    #             phone_nums = 0
+    #             for link in PackDict["Project"]["roleList"][i].linklist:
+    #                 if link[1]:
+    #                     phone_nums += 1
+    #                     break
+    #             if not phone_nums:
+    #                 role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
+    # if role_with_no_phone:
+    #     phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
+    #     # phone_with_person = [phone for phone in phone_with_person if phone]
+    #
+    #     dict_index_sentence = {}
+    #     for _sentence in list_sentence:
+    #         dict_index_sentence[_sentence.sentence_index] = _sentence
+    #     new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
+    #     for index in range(len(new_entity_list)):
+    #         entity = new_entity_list[index]
+    #         if entity.entity_text in role_with_no_phone:
+    #             e_sentence = dict_index_sentence[entity.sentence_index]
+    #             entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
+    #             entity_right = "".join(entity_right)
+    #             if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
+    #                 entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
+    #             have_phone = re.findall(phone,entity_right)
+    #             if have_phone:
+    #                 _phone = have_phone[0]
+    #                 phone_begin = entity_right.find(_phone)
+    #                 if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
+    #                     # entity.person_phone = _phone
+    #                     for i in range(len(PackDict["Project"]["roleList"])):
+    #                         if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
+    #                             PackDict["Project"]["roleList"][i].linklist.append(('', _phone))
+
+    
+    #寻找多标段招标金额
+    p_entity = len(list_entity)-1
+
+    set_tenderer_money = set()
+    list_tenderer_money = []  #2021/7/16 新增列表,倒序保存所有中标金额
+    unit_list = [] #2021/8/17 新增,保存金额单位
+
+    #遍历所有实体
+    while(p_entity>=0):
+        entity = list_entity[p_entity]
+        if entity.entity_type=="money":
+            # 2021/12/03 添加成本警戒线、保证金
+            if entity.notes in ['保证金', '成本警戒线']:
+                packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
+                                                   "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
+                if packagePointer is None:
+                    packageName = "Project"
+                else:
+                    packageName = packagePointer.entity_text
+
+                if packageName == "Project":
+                    # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
+                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                    if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
+                        PackDict["Project"]["bond"] = float(entity.entity_text)
+                    elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
+                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)
+
+                else:
+                    if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
+                        PackDict[packageName]["bond"] = float(entity.entity_text)
+                    elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
+                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)
+
+            elif entity.values[entity.label]>=on_value:
+                if str(entity.label)=="1":
+                    set_tenderer_money.add(float(entity.entity_text))
+                    list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表,倒序保存所有中标金额
+                    unit_list.append(entity.money_unit)
+                # if str(entity.label)=="0":
+                if str(entity.label)=="0" and entity.notes!='总投资':
+                    '''
+                    if p_entity>0:
+                        p_before = list_entity[p_entity-1]
+                        if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
+                            p_entity -= 1
+                            continue
+                    '''
+                    packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
+                    if packagePointer is None:
+                        packageName = "Project"
+                    else:
+                        packageName = packagePointer.entity_text
+                        
+                    if packageName=="Project":
+                        # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
+                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                        if entity.values[entity.label]>on_value:
+                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                            PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
+                    else:
+                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
+                        PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
+                        #add pointer_tendereeMoney
+                        packagePointer.pointer_tendereeMoney = entity
+        p_entity -= 1            
+    
+        
+    #删除一个机构有多个角色的数据
+    #删除重复人、概率不回传
+    final_roleList = []
+    list_pop = []
+    set_tenderer_role = set()
+    dict_pack_tenderer_money = dict()
+
+    for pack in PackDict.keys():
+        #删除无效包
+        if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
+            list_pop.append(pack)
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
+                if PackDict[pack]["roleList"][i].money==0:
+                    set_tenderer_role.add(PackDict[pack]["roleList"][i])
+                    dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
+    #找到包的中投标金额
+    for _index in range(len(PackageList)):
+        if "hit" in PackageList[_index]:
+            for _hit in list(PackageList[_index]["hit"]):
+                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
+                if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
+                    dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
+    #只找到一个中标人和中标金额
+    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
+        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
+        list(set_tenderer_role)[0].money_unit = unit_list[0]
+        # print('一个中标人一个金额:', list(set_tenderer_money)[0])
+    #找到一个中标人和多个招标金额
+    if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
+        _maxMoney = 0
+        _sumMoney = 0
+        for _m in list(set_tenderer_money):
+            _sumMoney += _m
+            if _m>_maxMoney:
+                _maxMoney = _m
+        if _sumMoney/_maxMoney==2:
+            list(set_tenderer_role)[0].money = _maxMoney
+            # print('一人多金额分项合计 取最大金额:', _maxMoney)
+        else:
+            # list(set_tenderer_role)[0].money = _maxMoney
+            if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
+                list(set_tenderer_role)[0].money = min(list_tenderer_money)
+                list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
+                # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额:', min(list_tenderer_money))
+            else:
+                list(set_tenderer_role)[0].money = list_tenderer_money[-1]  # 2021/7/16 修改 不是单价合计方式取第一个中标金额
+                list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
+                # print('一人多金额 取第一个中标金额:', list_tenderer_money[-1])
+    #每个包都只找到一个金额
+    _flag_pack_money = True
+    for k,v in dict_pack_tenderer_money.items():
+        if len(v[1])!=1:
+            _flag_pack_money = False
+    if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
+        for k,v in dict_pack_tenderer_money.items():
+            v[0].money = list(v[1])[0]
+            # print('k,v in dict_pack_tenderer_money.items', k, v)
+    # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0:
+                # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
+                if float(PackDict[pack]["roleList"][i].money) >10000000 and \
+                        float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('招标金额校正中标金额')
+    # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
+                if float(PackDict[pack]["roleList"][i].money) < 1000 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
+    # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
+    for pack in PackDict.keys():
+        tmp_moneys = []
+        for i in range(len(PackDict[pack]["roleList"])):
+            if float(PackDict[pack]["roleList"][i].money) >100000:
+                tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
+        if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
+            for i in range(len(PackDict[pack]["roleList"])):
+                if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
+                    # print('通过其他中标人投标金额校正中标金额')
+    for item in list_pop:
+        PackDict.pop(item)
+    # 公告中只有"招标人"且无"联系人"链接时
+    if len(PackDict)==1:
+        k = list(PackDict.keys())[0]
+        if len(PackDict[k]["roleList"])==1:
+            if PackDict[k]["roleList"][0].role_name == "tenderee":
+                if not PackDict[k]["roleList"][0].linklist:
+                    get_contacts = False
+                    if not get_contacts:
+                        # 根据大纲Outline类召回联系人
+                        for outline in list_outline:
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
+                                for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
+                                    if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
+                                        t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
+                                        if t_person.person_phone:
+                                            _phone = [p.entity_text for p in t_person.person_phone]
+                                            for _p in _phone:
+                                                PackDict[k]["roleList"][0].linklist.append((t_person.entity_text, _p))
+                                            get_contacts = True
+                                            break
+                                    elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
+                                            words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
+                                        break
+                                if not get_contacts:
+                                    sentence_phone = phone.findall(outline.outline_text)
+                                    if sentence_phone:
+                                        PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                        get_contacts = True
+                                        break
+                    if not get_contacts:
+                        # 直接取文中倒数第一个联系人
+                        for _entity in temporary_list2[::-1]:
+                            if _entity.entity_type=='person' and _entity.label==3:
+                                if _entity.person_phone:
+                                    _phone = [p.entity_text for p in _entity.person_phone]
+                                    for _p in _phone:
+                                        PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
+                                    get_contacts = True
+                                    break
+                    if not get_contacts:
+                        # 如果文中只有一个“phone”实体,则直接取为联系人电话
+                        if len(phone_entitys) == 1:
+                            PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
+                            get_contacts = True
+                    if not get_contacts:
+                        # 通过大纲Outline类直接取电话
+                        if len(new_split_list) > 1:
+                            for _start, _end in new_split_list:
+                                temp_sentence = _content[_start:_end]
+                                sentence_outline = temp_sentence.split(",::")[0]
+                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
+                                    sentence_phone = phone.findall(temp_sentence)
+                                    if sentence_phone:
+                                        PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                        get_contacts = True
+                                        break
+                    if not get_contacts:
+                        # 通过正则提取句子段落进行提取电话
+                        contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
+                        tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
+                        contact_pattern_list = [tenderee_pattern + contacts_person,
+                                                "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
+                                                "(?:项目|采购)[^。,]{0,4}" + contacts_person,
+                                                "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ]
+                        for _pattern in contact_pattern_list:
+                            get_tenderee_contacts = False
+                            for regular_match in re.finditer(_pattern, _content):
+                                match_text = _content[regular_match.end():regular_match.end() + 40]
+                                match_text = match_text.split("。")[0]
+                                sentence_phone = phone.findall(match_text)
+                                if sentence_phone:
+                                    PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                    get_tenderee_contacts = True
+                                    break
+                            if get_tenderee_contacts:
+                                break
+
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
+    return PackDict
+
def initPackageAttr(RoleList,PackageSet):
    '''
    @summary: build the initial per-package result dict returned by the interface
        from the extracted role list and package-name set.
    @param RoleList: role objects carrying packageName/packageCode/role_name/entity_text
    @param PackageSet: names of the sub-packages found in the document
    @return: dict mapping package name -> {"code","tendereeMoney","roleList","tendereeMoneyUnit"}
    '''
    def _empty_pack():
        # every package starts with no code, zero tenderee money, no roles and no unit
        return {"code": "", "tendereeMoney": 0, "roleList": [], 'tendereeMoneyUnit': ''}

    packDict = {"Project": _empty_pack()}
    for pack_name in PackageSet:
        packDict[pack_name] = _empty_pack()
    for role in RoleList:
        pack = packDict[role.packageName]
        # keep the first package code seen for this package
        if pack["code"] == "":
            pack["code"] = role.packageCode
        # Role(role_name, entity_text, role_prob, money, money_prob, linklist)
        pack["roleList"].append(Role(role.role_name, role.entity_text, 0, 0, 0.0, []))
    return packDict
+                
def getPackageRoleMoney(list_sentence,list_entity,list_outline):
    '''
    @summary: extract package / bid-section code / role / money / contact info
        for one document.
    @param list_sentence: sentences of the document
    @param list_entity: entities of the document
    @param list_outline: outline sections of the document
    @return: PackDict mapping package name to its attributes, or [] when no role is found
    '''
    role_result = getRoleList(list_sentence, list_entity)
    # NOTE(review): returns a list here but a dict below — callers must accept both
    if not role_result:
        return []
    RoleList, RoleSet, PackageList, PackageSet = role_result
    pack_dict = initPackageAttr(RoleList, PackageSet)
    pack_dict = findAttributeAfterEntity(pack_dict, RoleSet, PackageList, PackageSet,
                                         list_sentence, list_entity, list_outline)
    return pack_dict
+
def turnBidWay(bidway):
    '''
    Normalize a raw bid-way phrase to one of the canonical bid-way categories.

    :param bidway: raw bid-way string extracted from the document
    :return: canonical category name, or "其他" when the phrase is unrecognized
    '''
    # canonical category -> exact raw phrases that map to it
    alias_map = {
        "邀请招标": ("邀请招标", "采购方式:邀请"),
        "询价": ("询价", "询单", "询比", "采购方式:询价"),
        "竞争性谈判": ("竞谈", "竞争性谈判", "公开竞谈"),
        "竞争性磋商": ("竞争性磋商", "磋商"),
        "竞价": ("竞价", "竞标", "电子竞价", "以电子竞价", "电子书面竞投"),
        "公开招标": ("公开招标", "网上电子投标", "网上招标", "采购方式:公开", "招标为其他"),
        # BUG FIX: the original used `bidway in ("单一来源")` / `bidway in ("比选")`,
        # where the parenthesized value is a plain string, so `in` performed a
        # substring test (e.g. "来源" wrongly matched). One-element tuples need a
        # trailing comma.
        "单一来源": ("单一来源",),
        "比选": ("比选",),
    }
    for canonical, aliases in alias_map.items():
        if bidway in aliases:
            return canonical
    return "其他"
+
# matches dates like 2021-03-05 / 2021/3/5 / 2021年3月5 / 21.3.5 (named groups year/month/day)
my_time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
import time
def my_timeFormat(_time):
    '''
    Extract every legal date found in *_time* and normalize it to "YYYY-MM-DD".

    A date is kept only when year/month/day are all present, the (possibly
    2-digit, assumed 20xx) year is not later than the current year, the month
    is <= 12 and the day is <= 31.

    :param _time: raw text possibly containing one or more dates
    :return: list of "YYYY-MM-DD" strings, in order of appearance
    '''
    current_year = int(time.strftime("%Y", time.localtime()))
    time_list = []
    for _match in re.finditer(my_time_format_pattern, _time):
        if not _match.group():
            continue
        year = _match.group("year") or ""
        month = _match.group("month") or ""
        day = _match.group("day") or ""
        # all three components must be present
        if not (year and month and day):
            continue
        if len(year) == 2:
            # two-digit years are assumed to be 20xx
            year = "20" + year
        # reject future years and out-of-range month/day values
        if int(year) > current_year or int(month) > 12 or int(day) > 31:
            continue
        time_list.append("%s-%s-%s" % (year, month.zfill(2), day.zfill(2)))
    return time_list
+
+def getTimeAttributes(list_entity,list_sentence):
+    time_entitys = [i for i in list_entity if i.entity_type=='time']
+    time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
+    list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
+    dict_time = {
+        "time_release": [], # 1 发布时间
+        "time_bidopen": [], # 2 开标时间
+        "time_bidclose": [], # 3 截标时间
+        'time_bidstart': [],  # 12 投标(开始)时间、响应文件接收(开始)时间
+
+        'time_publicityStart': [],  # 4 公示开始时间(公示时间、公示期)
+        'time_publicityEnd': [],  # 5 公示截止时间
+        'time_getFileStart': [],  # 6 文件获取开始时间(文件获取时间)
+        'time_getFileEnd': [],  # 7 文件获取截止时间
+        'time_registrationStart': [],  # 8 报名开始时间(报名时间)
+        'time_registrationEnd': [],  # 9 报名截止时间
+        'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
+        'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
+        'time_commencement':[] , #13 开工日期
+        'time_completion': []  # 14 竣工日期
+    }
+    last_sentence_index = 0
+    last_time_type = ""
+    last_time_index = {
+        'time_bidstart':"time_bidclose",
+        'time_publicityStart':"time_publicityEnd",
+        'time_getFileStart':"time_getFileEnd",
+        'time_registrationStart':"time_registrationEnd",
+        'time_earnestMoneyStart':"time_earnestMoneyEnd",
+        'time_commencement':"time_completion",
+    }
+    for entity in time_entitys:
+        sentence_text = list_sentence[entity.sentence_index].sentence_text
+        entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
+        label_prob = entity.values[entity.label]
+        entity_text = entity.entity_text
+        extract_time = my_timeFormat(entity_text)
+        if extract_time:
+            if re.search("至|到", entity_left):
+                if entity.sentence_index == last_sentence_index:
+                    time_type = last_time_index.get(last_time_type)
+                    if time_type:
+                        dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10))
+                        last_time_type = ""
+                        continue
+            if entity.label!=0:
+                if entity.label==1 and label_prob>0.5:
+                    dict_time['time_release'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_release'
+                elif entity.label==2 and label_prob>0.5:
+                    dict_time['time_bidopen'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidopen'
+                elif entity.label==3 and label_prob>0.5:
+                    dict_time['time_bidclose'].append((extract_time[0],label_prob))
+                    last_time_type = 'time_bidclose'
+                elif entity.label==12 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_bidclose'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidclose'
+                        else:
+                            dict_time['time_bidstart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_bidstart'
+                    else:
+                        dict_time['time_bidstart'].append((extract_time[0],label_prob))
+                        dict_time['time_bidclose'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==4 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityEnd'
+                        else:
+                            dict_time['time_publicityStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_publicityStart'
+                    else:
+                        dict_time['time_publicityStart'].append((extract_time[0],label_prob))
+                        dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==5 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_publicityEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_publicityEnd'
+                    else:
+                        dict_time['time_publicityStart'].append((extract_time[0],label_prob))
+                        dict_time['time_publicityEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==6 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileEnd'
+                        else:
+                            dict_time['time_getFileStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_getFileStart'
+                    else:
+                        dict_time['time_getFileStart'].append((extract_time[0],label_prob))
+                        dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==7 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_getFileEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_getFileEnd'
+                    else:
+                        dict_time['time_getFileStart'].append((extract_time[0],label_prob))
+                        dict_time['time_getFileEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==8 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationEnd'
+                        else:
+                            dict_time['time_registrationStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_registrationStart'
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0],label_prob))
+                        dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==9 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_registrationEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_registrationEnd'
+                    else:
+                        dict_time['time_registrationStart'].append((extract_time[0],label_prob))
+                        dict_time['time_registrationEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==10 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyEnd'
+                        else:
+                            dict_time['time_earnestMoneyStart'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_earnestMoneyStart'
+                    else:
+                        dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==11 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_earnestMoneyEnd'
+                    else:
+                        dict_time['time_earnestMoneyStart'].append((extract_time[0],label_prob))
+                        dict_time['time_earnestMoneyEnd'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==13 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                            dict_time['time_completion'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_completion'
+                        else:
+                            dict_time['time_commencement'].append((extract_time[0], label_prob))
+                            last_time_type = 'time_commencement'
+                    else:
+                        dict_time['time_commencement'].append((extract_time[0],label_prob))
+                        dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                elif entity.label==14 and label_prob>0.5:
+                    if len(extract_time)==1:
+                        dict_time['time_completion'].append((extract_time[0], label_prob))
+                        last_time_type = 'time_completion'
+                    else:
+                        dict_time['time_commencement'].append((extract_time[0],label_prob))
+                        dict_time['time_completion'].append((extract_time[1],label_prob))
+                        last_time_type = ''
+                else:
+                    last_time_type = ""
+            else:
+                last_time_type = ""
+        else:
+            last_time_type = ""
+        last_sentence_index = entity.sentence_index
+
+
+    result_dict = dict((key,"") for key in dict_time.keys())
+    for time_type,value in dict_time.items():
+        list_time = dict_time[time_type]
+        if list_time:
+            list_time.sort(key=lambda x:x[1],reverse=True)
+            result_dict[time_type] = list_time[0][0]
+    return result_dict
+
+def getOtherAttributes(list_entity):
+    # Collect document-level attributes that are not role/money related:
+    # funding source, review persons, service time, product names and the
+    # largest "总投资" (total investment) money entity found in the list.
+    dict_other = {"moneysource":"",
+                  "person_review":[],
+                  "serviceTime":"",
+                  "product":[],
+                  "total_tendereeMoney":0,
+                  "total_tendereeMoneyUnit":''}
+
+    for entity in list_entity:
+        if entity.entity_type == 'bidway':
+            # NOTE(review): "bidway" is not pre-initialised in dict_other, so
+            # the key only exists when a bidway entity occurs — confirm
+            # downstream readers tolerate its absence.
+            dict_other["bidway"] = turnBidWay(entity.entity_text)
+        elif entity.entity_type=='moneysource':
+            dict_other["moneysource"] = entity.entity_text
+        elif entity.entity_type=='serviceTime':
+            # A service time from the main text always wins; a value found in
+            # an attachment is kept only when nothing was set yet.
+            if not entity.in_attachment:
+                dict_other["serviceTime"] = entity.entity_text
+            else:
+                if not dict_other["serviceTime"]:
+                    dict_other["serviceTime"] = entity.entity_text
+        elif entity.entity_type=="person" and entity.label ==4:
+            # label==4 person entities are collected as review persons.
+            dict_other["person_review"].append(entity.entity_text)
+        elif entity.entity_type=='product':
+            dict_other["product"].append(entity.entity_text)
+        elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
+                # Keep the maximum total-investment amount and its unit.
+                dict_other["total_tendereeMoney"] = float(entity.entity_text)
+                dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+
+    # Deduplicate product names (set round-trip: order is not preserved).
+    dict_other["product"] = list(set(dict_other["product"]))
+    return dict_other
+
+def getMoneyRange(RoleList):
+    # Placeholder — money-range computation is not implemented yet.
+    pass
+
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
+    '''
+    Assemble the final per-article extraction result.
+    @param:
+        list_sentences: list of sentence-lists, one per article
+        list_entitys: list of entity-lists, one per article
+    @return: list of dicts, one per article, containing the package/role/
+        entity-name/money/contact data ("prem"), the docid, the time
+        attributes and article-level metadata (fingerprint, matched
+        enterprises, process time, attachment types, bidway)
+    '''
+    result = []
+    for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
+        RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
+        # One flat dict per article: prem + time attributes + article metadata.
+        result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
+                           **getTimeAttributes(list_entity, list_sentence),
+                           **{"fingerprint": list_article.fingerprint,
+                              "match_enterprise": list_article.match_enterprise,
+                              "match_enterprise_type": list_article.match_enterprise_type,
+                              "process_time": getCurrent_date(),
+                              "attachmentTypes": list_article.attachmentTypes, "bidway": list_article.bidway}))
+        # Earlier variant kept for reference: it also merged getOtherAttributes.
+        # result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
+        #                    **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
+        #                       "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
+        #                       "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
+    return result
+
+
+if __name__=="__main__":
+    '''
+    conn = getConnection()
+    cursor = conn.cursor()
+    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
+    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
+    
+    result = []
+    
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    count = 0
+    for row in rows:
+        
+        count += 1
+        # print(count)
+        doc_id = row[0]
+        
+        roleList = getPackageRoleMoney(doc_id)
+        result.append([doc_id,str(roleList),row[1]])
+        ''''''
+    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
+        f.write('<html><head>\
+        <meta http-equiv="Content-Type"\
+        content="text/html; charset=UTF-8">\
+        </head>\
+        <body bgcolor="#FFFFFF">\
+        <table border="1">\
+        <tr>\
+        <td>doc_id</td>\
+        <td>角色</td>\
+        </tr>')
+        for item in result:
+            f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
+        f.write("</table></body>")
+    '''

La diferencia del archivo ha sido suprimida porque es demasiado grande
+ 1517 - 10
BiddingKG/maxcompute/1.py


La diferencia del archivo ha sido suprimida porque es demasiado grande
+ 894 - 25
BiddingKG/maxcompute/documentDumplicate.py


+ 51 - 17
BiddingKG/maxcompute/documentMerge.py

@@ -86,7 +86,10 @@ def getSet(list_dict,key):
                     _set.add(str(item[key]))
     return _set
 
-def split_with_time(list_dict,sort_key,timedelta=86400*120):
+def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
+    group_num = 1
+    if more_than_one:
+        group_num = 2
     if len(list_dict)>0:
         if sort_key in list_dict[0]:
             list_dict.sort(key=lambda x:x[sort_key])
@@ -102,7 +105,7 @@ def split_with_time(list_dict,sort_key,timedelta=86400*120):
                     if len(_group)>1:
                         list_group.append(_group)
                     _begin = i + 1
-            if len(list_dict)>1:
+            if len(list_dict)>=group_num:
                 _group = []
                 for j in range(_begin,len(list_dict)):
                     _group.append(list_dict[j])
@@ -442,12 +445,6 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         return _result
 
 
-
-
-
-
-
-
     def terminate(self, buffer):
         list_group = []
         the_group = buffer[0]
@@ -461,6 +458,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
         re_merge = False
         for _key in keys:
             if len(getSet(the_group,_key))>1:
+                log("has_more_than_one:%s"%str(getSet(the_group,_key)))
                 re_merge = True
                 break
         #判断是否相似而不相同
@@ -565,8 +563,7 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
                 #     final_group.append(list(set(_group["docid"])))
         else:
             final_group = [list(set([item["docid"] for item in the_group]))]
-        log(str(final_group))
-
+        log("%s--%s"%("final_group",str(final_group)))
 
         #每个channel选择一篇公告
         final_group_channel = []
@@ -586,16 +583,16 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
 
             #根据日期进行切分
             new_dict_channel_id = {}
-            print(dict_channel_id)
+            log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
             for k,v in dict_channel_id.items():
-                list_time_docids = split_with_time(v,"page_time_stamp",86400*6)
-                print(list_time_docids)
+                list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
+                log(list_time_docids)
                 for _l in list_time_docids:
                     list_t = self.splitByTimezone(_l,"json_dicttime")
                     for _t in list_t:
                         otherChannel += 1
                         new_dict_channel_id[otherChannel] = _t
-            print(new_dict_channel_id)
+            log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
             channel_dict = {}
             for k,v in new_dict_channel_id.items():
                 v.sort(key=lambda x:x["docid"])
@@ -1231,11 +1228,48 @@ class f_encode_time(object):
 
         return _encode
 
+@annotate('string,string -> string,string')
+class f_decode_ruwei(BaseUDTF):
+    # MaxCompute UDTF: explode a document's sub_docs_json into one
+    # (page_time, tenderer) output row for each non-empty win/second/third
+    # tenderer entry found in the JSON array.
+
+    def __init__(self):
+        import logging
+        import json
+        # Bind the lazily imported modules globally so process() can use them.
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, page_time,sub_docs_json):
+        # sub_docs_json: JSON array of sub-document dicts; forward one row
+        # per tenderer field that is present and non-empty.
+        if sub_docs_json is not None:
+            for sub_docs in json.loads(sub_docs_json):
+                if sub_docs.get("win_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("win_tenderer",""))
+                if sub_docs.get("second_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("second_tenderer",""))
+                if sub_docs.get("third_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("third_tenderer",""))
+
+
 if __name__ == '__main__':
     a = f_remege_limit_num_contain_bychannel()
     buffer = a.new_buffer()
-    a.iterate(buffer,1,1,86400*1,"1","1","1","1","1","1","1",5,5,None)
-    a.iterate(buffer,3,1,86400*4,"1","1","1","1","1","1","1",5,5,'{"a":"dbb"}')
-    a.iterate(buffer,5,1,86400*10,"1","1","1","1","1","1","1",5,5,"{}")
+    tmp_s = '''
+    234858920	229011768	2022-03-25	1648137600		横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标文件.pdf	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工文件.pdf	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "2022-04-29", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	232745950	2022-04-12	1649692800	E4404000001002779001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标答疑	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工答疑	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	8	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234858920	2022-04-21	1650470400	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	1	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234595980	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				105	0	10	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-22", "time_publicity_start": "2022-04-21", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	228908786	2022-03-25	1648137600	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司			1795743.68	52	0	8	"{"time_bidclose": "2022-04-20", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "2022-04-20", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "2022-03-26", "time_publicity_end": "2022-04-26", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234523333	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	0	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234787082	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工开标记录表	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工开标记录表					1795743.68	101	0	6	"{"time_bidclose": "", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	235240618	2022-04-22	1650556800	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工			广东博思信息技术股份有限公司	1775136.23		101	0	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-26", "time_publicity_start": "2022-04-24", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+
+    '''
+    for _s in tmp_s.split("\n"):
+        ls = _s.split("\t")
+        if len(ls)!=17:
+            continue
+        _confid = 1 if ls[14] =="" else ls[14]
+        a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
     print(a.terminate(buffer))
     print(1)

+ 46 - 1
BiddingKG/maxcompute/enterpriseFix.py

@@ -1,5 +1,9 @@
 #coding:utf8
 from odps.udf import annotate,BaseUDAF,BaseUDTF
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import json
+import traceback
 
 @annotate('string->string')
 class getYearMonth(object):
@@ -127,4 +131,45 @@ class f_turn_circle(object):
         if name is not None:
             return name.replace("(","(").replace(")",")")
         else:
-            return ""
+            return ""
+
+@annotate('string,string->string,bigint')
+class f_dumplicate_contacts(BaseUDTF):
+    # MaxCompute UDTF: deduplicate an enterprise's JSON contact list.
+    # Emits (deduplicated_json, 1) on success or (None, 0) on parse failure.
+
+    def __init__(self):
+        pass
+
+    def process(self,name,contacts):
+        # name: enterprise name — used only for logging when rows are dropped.
+        # contacts: JSON array of contact dicts with keys like
+        #   contact_person / mobile_no / phone_no / level.
+        if contacts is None:
+            self.forward(contacts,1)
+            return
+        try:
+            list_contacts = json.loads(contacts)
+            _set = set()          # seen "person-mobile-phone" keys
+            _phone_set = set()    # numbers already claimed by a kept contact
+            new_list_contacts = []
+            # Longest contact names first, so named entries claim their
+            # numbers before anonymous (empty-name) ones are considered.
+            list_contacts.sort(key=lambda x:len(x.get("contact_person","")),reverse=True)
+            for _conta in list_contacts:
+                contact_person = _conta.get("contact_person","")
+                mobile_no = _conta.get("mobile_no","")
+                phone_no = _conta.get("phone_no","")
+                # Drop anonymous entries whose number is already covered.
+                if contact_person=="" and (mobile_no in _phone_set or phone_no in _phone_set):
+                    continue
+                # Drop exact duplicates of an already-kept contact.
+                _key = "%s-%s-%s"%(contact_person,mobile_no,phone_no)
+                if _key in _set:
+                    continue
+                if mobile_no!="":
+                    _phone_set.add(mobile_no)
+                if phone_no!="":
+                    _phone_set.add(phone_no)
+                new_list_contacts.append(_conta)
+                _set.add(_key)
+            # Log the enterprise name whenever deduplication removed rows.
+            if len(new_list_contacts)!=len(list_contacts):
+                logging.info(name)
+            # Highest "level" first in the final output.
+            new_list_contacts.sort(key=lambda x:x.get("level",0),reverse=True)
+            self.forward(json.dumps(new_list_contacts,ensure_ascii=False),1)
+        except Exception as e:
+            traceback.print_exc()
+            logging.info(contacts)
+            self.forward(None,0)
+

+ 2 - 2
BiddingKG/maxcompute/evaluates.py

@@ -76,8 +76,8 @@ def multiLoadEnv():
         # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
         #改为zip引入
         log("=======")
-        include_package_path("BiddingKG.baseline.zip")
-        # include_package_path("BiddingKG.backup.zip")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():

+ 113 - 27
BiddingKG/maxcompute/proposedBuildingProject.py

@@ -10,17 +10,23 @@ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(le
 import time
 import uuid
 import re
+import traceback
+from multiprocessing import Process,Queue
 
 
+def log(msg):
+    logging.info(msg)
+
 # 配置pandas依赖包
 def include_package_path(res_name):
     import os, sys
     archive_files = get_cache_archive(res_name)
     dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
                         if '.dist_info' not in f.name], key=lambda v: len(v))
-    sys.path.append(dir_names[0])
-
-    return os.path.dirname(dir_names[0])
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+    return _path
 
 # 可能出现类似RuntimeError: xxx has been blocked by sandbox
 # 这是因为包含C的库,会被沙盘block,可设置set odps.isolation.session.enable = true
@@ -66,8 +72,7 @@ def init_env(list_files,package_name):
 def multiLoadEnv():
     def load_project():
         start_time = time.time()
-        # init_env(["BiddingKG.zip.env.line"],str(uuid.uuid4()))
-        init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():
@@ -204,33 +209,114 @@ class extract_proposedBuilding(BaseUDTF):
         import pandas as pd
         global pd
         self._pattern = getPattern()
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
 
-        global Preprocessing,spanWindow,timeFormat
+        self.task_queue = Queue()
+        self.result_queue = Queue()
+        self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+        self.deal_process.start()
+        import numpy as np
+        self.last_timeout = False
 
+    def f_queue_process(self,task_queue,result_queue):
+        log("start import predict function")
+        import BiddingKG.dl.interface.Preprocessing as Preprocessing
+        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
 
-    def process(self, doc_id,dochtmlcon,doctitle,project_name):
-        _stage = extract_legal_stage(doctitle)
-        if _stage is not None:
-            list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle]],useselffool=True)
-            for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
-                content = list_article.content
+        global spanWindow,timeFormat
+        log("import done")
+        while True:
+            try:
+                item = task_queue.get(True,timeout=10)
+
+                doc_id = item.get("docid","")
+                dochtmlcon = item.get("dochtmlcon","")
+                doctitle = item.get("doctitle","")
+                project_name = item.get("project_name","")
+                log("start process docid:%s"%(str(doc_id)))
                 _stage = extract_legal_stage(doctitle)
-                if _stage is None:
-                    continue
-                _industry = extract_industry(content,self._pattern)
-                if _industry is None:
-                    continue
-                _proportion = extract_proportion(content)
-                _projectDigest = extract_projectDigest(content)
-                _projectAddress = extract_projectAddress(list_sentence,list_entity)
-                _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
-                project_name_refind = ""
-                if project_name is not None and len(project_name)>0:
-                    project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
+                result_json = None
                 if _stage is not None:
-                    self.forward(_stage,_proportion,_projectDigest,_projectAddress,_begin_time,_end_time,project_name_refind,_industry)
+                    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle,"",""]],useselffool=True)
+                    for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+                        content = list_article.content
+                        _stage = extract_legal_stage(doctitle)
+                        if _stage is None:
+                            continue
+                        _industry = extract_industry(content,self._pattern)
+                        if _industry is None:
+                            continue
+                        _proportion = extract_proportion(content)
+                        _projectDigest = extract_projectDigest(content)
+                        _projectAddress = extract_projectAddress(list_sentence,list_entity)
+                        _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
+                        project_name_refind = ""
+                        if project_name is not None and len(project_name)>0:
+                            project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
+                        if _stage is not None:
+                            result_json = {"_stage":_stage,
+                                           "_proportion":_proportion,
+                                           "_projectAddress":_projectAddress,
+                                           "_projectDigest":_projectDigest,
+                                           "_begin_time":_begin_time,
+                                           "_end_time":_end_time,
+                                           "project_name_refind":project_name_refind,
+                                           "_industry":_industry}
+
+                result_queue.put(result_json,True)
+                log("end process docid:%s"%(str(doc_id)))
+            except Exception as e:
+                traceback.print_exc()
+                log("get data time out")
+                pass
+
+    def process(self,doc_id,dochtmlcon,doctitle,project_name):
+        # Driver side of the worker pattern: hand one document to the child
+        # process via task_queue and forward its extraction result. Guards
+        # against worker hangs/crashes by draining stale queue entries,
+        # restarting a dead worker, and killing it on timeout.
+        # # direct (in-process) handling, kept for reference:
+        # if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+        #     result_json = predict(str(_doc_id),content,str(_title))
+        #     self.forward(page_time,int(_doc_id),result_json)
+
+
+        # The hard-coded docids are skipped (presumably known-problematic
+        # documents — TODO confirm).
+        if dochtmlcon is not None and doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+            # Drain any stale items left in the queues from a previous call.
+            try:
+                while(self.task_queue.qsize()>0):
+                    self.task_queue.get(timeout=5)
+            except Exception as e:
+                pass
+            try:
+                while(self.result_queue.qsize()>0):
+                    self.result_queue.get(timeout=5)
+            except Exception as e:
+                pass
+
+            _item = {"docid":doc_id,"dochtmlcon":dochtmlcon,"doctitle":doctitle,"project_name":project_name}
+
+
+            try:
+                # Base wait of 4 minutes; grant extra time after a previous
+                # timeout or when the worker has to be restarted (imports are
+                # expensive in the fresh process).
+                _timeout = 60*4
+                if self.last_timeout:
+                    _timeout += 60*5
+                    self.last_timeout = False
+                if not self.deal_process.is_alive():
+                    log("deal process is down")
+                    self.task_queue = Queue()
+                    self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                    self.deal_process.start()
+                    _timeout += 60*5
+                log("putting item to task_queue with docid:%s"%(str(doc_id)))
+                self.task_queue.put(_item)
+                result_json = self.result_queue.get(timeout=_timeout)
+                # The worker puts None when nothing was extracted.
+                if result_json is not None:
+                    self.forward(result_json.get("_stage"),result_json.get("_proportion"),result_json.get("_projectDigest"),result_json.get("_projectAddress"),result_json.get("_begin_time"),result_json.get("_end_time"),result_json.get("project_name_refind"),result_json.get("_industry"))
+            except Exception as e:
+                # Treat any failure (typically result_queue.get timing out) as
+                # a stuck worker: kill it and start a fresh one for next call.
+                log("dealing docid %s failed by timeout"%(str(doc_id)))
+                self.last_timeout = True
+                self.deal_process.kill()
+                time.sleep(5)
+                self.task_queue = Queue()
+                self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                self.deal_process.start()
 
 
 @annotate('bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string')

+ 386 - 0
BiddingKG/maxcompute/去重规则.md

@@ -0,0 +1,386 @@
+
+
+--新增规则
+根据公告附件进行去重
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),1 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,doctitle_refine,win_tenderer,win_bid_price
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),2 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--中标公告 - 同项目编号 - 同标题 - 同中标人 - 中标价为空(win_bid_price="")
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),3 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+--招标 编号 标题 招标人 预算 站源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),4 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 同一个招标人同一天采购同一样物品的时候,这个规则就不适用了
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+-- where docchannel='52'
+-- group by project_name,tenderee,bidding_budget
+-- having project_name!="" and project_name is not NULL 
+-- and tenderee!="" and tenderee is not NULL 
+-- and bidding_budget!="";
+
+--招标公告 编号 名称 预算 站源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+where docchannel not in (101,118,119,120)
+group by docchannel,project_code,project_name,bidding_budget
+having project_name!="" and project_name is not NULL 
+and project_code!="" and project_code is not NULL 
+and bidding_budget!=""
+and count(1)>1;
+
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),6 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_name,agency,bidding_budget
+having project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),7 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and agency!="" and agency is not NULL 
+and count(1)>1;
+
+-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 -  同类型 - 信息源>1 - 同项目编号
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),8 from run_dumplicate_document_his
+where docchannel not in (101,119,120)
+group by docchannel,project_name,page_time_stamp,tenderee,bidding_budget,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and tenderee!="" and tenderee is not NULL 
+and bidding_budget!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),9 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+-- 8. 中标公告 - 同项目名称 - 同发布日期 - 同中标人 - 同中标价 -  同类型 - 信息源>1 - 同项目编号
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),10 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,page_time_stamp,win_tenderer,win_bid_price,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and win_tenderer!="" 
+and win_bid_price!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- -- 6. 不同公告类型 - 同原标题- 同日期
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,docchannel,2,tenderee),11 from run_dumplicate_document_his
+-- group by doctitle,page_time_stamp
+-- having doctitle!="" and doctitle is not NULL 
+-- and page_time_stamp>0
+-- and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),12 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,tenderee,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- 招标公告 - 同项目编号 - 同标题 - 同代理公司 - 同预算(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),13 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 公告内容完全相同的去重
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,1),0 from run_dumplicate_document_his
+group by fingerprint
+having length(fingerprint)>0
+and count(1)>1;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,agency,1,doctitle_refine),35 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,bidding_budget
+-- having length(win_bid_price)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,tenderee,1,doctitle_refine),36 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,agency,bidding_budget
+-- having length(agency)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,bidding_budget,agency,1,doctitle_refine),37 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,win_bid_price
+-- having length(win_tenderer)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,bidding_budget,1,doctitle_refine),38 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,agency
+-- having length(win_tenderer)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,bidding_budget,1,doctitle_refine),39 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,agency
+-- having length(win_bid_price)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),14 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),15 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),16 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),17 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,win_tenderer,win_bid_price
+having project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),18 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,agency,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),19 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),20 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),21 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),22 from run_dumplicate_document_his
+group by docchannel,project_code,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),23 from run_dumplicate_document_his
+group by docchannel,project_name,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),24 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),25 from run_dumplicate_document_his
+group by docchannel,project_code,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),26 from run_dumplicate_document_his
+group by docchannel,project_name,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,agency,1,doctitle_refine),30 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,bidding_budget
+-- having length(tenderee)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,bidding_budget,win_bid_price,agency,1,doctitle_refine),31 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_tenderer
+-- having length(tenderee)>0
+-- and length(win_tenderer)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,bidding_budget,agency,1,doctitle_refine),32 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_bid_price
+-- having length(tenderee)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,bidding_budget,1,doctitle_refine),33 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,agency
+-- having length(tenderee)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,agency,1,doctitle_refine),34 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,bidding_budget
+-- having length(win_tenderer)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+--标题和类型相同的公告分为 编号 预算 中标人 中标价 代理都为空 及其它 两组 对这两组的数据进行匹配 规则是招标人相同且站源不同
+insert into document_group_his(json_set_docid,rule_id)
+select F_SET_DOCID_BINARYCHART(docid,page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no),0 
+from run_dumplicate_document_his
+where 1=1
+group by doctitle_refine,docchannel
+having length(doctitle_refine)>7 and count(1)>1;

+ 6 - 0
BiddingKG/maxcompute/重跑历史数据.md

@@ -0,0 +1,6 @@
+
+
+重跑历史数据需要注意的事项
+1. 对要素提取的公司进行清理
+2. 对联系方式进行清理
+3. 对重复公告写入doctextcon的辅助搜索的数据进行清理

+ 3438 - 0
BiddingKG/predictor.py

@@ -0,0 +1,3438 @@
+'''
+Created on 2018年12月26日
+
+@author: User
+'''
+
+import os
+import sys
+from BiddingKG.dl.common.nerUtils import *
+sys.path.append(os.path.abspath("../.."))
+# from keras.engine import topology
+# from keras import models
+# from keras import layers
+# from keras_contrib.layers.crf import CRF
+# from keras.preprocessing.sequence import pad_sequences
+# from keras import optimizers,losses,metrics
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.modelFactory import *
+import tensorflow as tf
+from BiddingKG.dl.product.data_util import decode, process_data
+from BiddingKG.dl.interface.Entitys import Entity
+from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
+from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
+from bs4 import BeautifulSoup
+import copy
+import calendar
+import datetime
+
+from threading import RLock
+# Registry of lazily-created predictor singletons, keyed by predictor type.
+# Each entry pairs the (initially None) predictor instance with an RLock so
+# getPredictor() can construct it exactly once under concurrent access.
+dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
+              "prem":{"predictor":None,"Lock":RLock()},
+              "epc":{"predictor":None,"Lock":RLock()},
+              "roleRule":{"predictor":None,"Lock":RLock()},
+              "roleRuleFinal":{"predictor":None,"Lock":RLock()},
+              "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
+                  "form":{"predictor":None,"Lock":RLock()},
+                  "time":{"predictor":None,"Lock":RLock()},
+                  "punish":{"predictor":None,"Lock":RLock()},
+                  "product":{"predictor":None,"Lock":RLock()},
+                "product_attrs":{"predictor":None,"Lock":RLock()},
+                  "channel": {"predictor": None, "Lock": RLock()},
+                  "deposit_payment_way": {"predictor": None, "Lock": RLock()},
+                  "total_unit_money": {"predictor": None, "Lock": RLock()}
+                  }
+
+
+def getPredictor(_type):
+    """Return the shared predictor instance for ``_type``, building it lazily.
+
+    Predictor construction loads heavyweight models, so each predictor is
+    created at most once and cached in the module-level ``dict_predictor``;
+    the per-type RLock serializes first-time construction across threads.
+
+    :param _type: registry key, e.g. "codeName", "prem", "channel".
+    :return: the cached predictor object for ``_type``.
+    :raises NameError: if ``_type`` is not a key of ``dict_predictor``.
+    """
+    if _type in dict_predictor:
+        with dict_predictor[_type]["Lock"]:
+            # Only the first caller per type pays the construction cost.
+            if dict_predictor[_type]["predictor"] is None:
+                if _type == "codeName":
+                    dict_predictor[_type]["predictor"] = CodeNamePredict()
+                if _type == "prem":
+                    dict_predictor[_type]["predictor"] = PREMPredict()
+                if _type == "epc":
+                    dict_predictor[_type]["predictor"] = EPCPredict()
+                if _type == "roleRule":
+                    dict_predictor[_type]["predictor"] = RoleRulePredictor()
+                if _type == "roleRuleFinal":
+                    dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
+                if _type == "tendereeRuleRecall":
+                    dict_predictor[_type]["predictor"] = TendereeRuleRecall()
+                if _type == "form":
+                    dict_predictor[_type]["predictor"] = FormPredictor()
+                if _type == "time":
+                    dict_predictor[_type]["predictor"] = TimePredictor()
+                if _type == "punish":
+                    dict_predictor[_type]["predictor"] = Punish_Extract()
+                if _type == "product":
+                    dict_predictor[_type]["predictor"] = ProductPredictor()
+                if _type == "product_attrs":
+                    dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
+                if _type == "channel":
+                    dict_predictor[_type]["predictor"] = DocChannel()
+                if _type == 'deposit_payment_way':
+                    dict_predictor[_type]["predictor"] = DepositPaymentWay()
+                if _type == 'total_unit_money':
+                    dict_predictor[_type]["predictor"] = TotalUnitMoney()
+            return dict_predictor[_type]["predictor"]
+    raise NameError("no this type of predictor")
+
+
+# 编号名称模型
+class CodeNamePredict():
+    
+    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
+        """Set up vocabulary, label regexes and TF sessions for code/name extraction.
+
+        :param EMBED_DIM: embedding dimension; defaults to 60 when None.
+        :param BiRNN_UNITS: BiLSTM unit count; defaults to 200 when None.
+        :param lazyLoad: when falsy, both saved models are loaded eagerly here.
+            NOTE(review): the default ``getLazyLoad()`` is evaluated once at
+            import time, not per call -- confirm that is intended.
+        """
+        self.model = None
+        self.MAX_LEN = None
+        self.model_code = None
+        if EMBED_DIM is None:
+            self.EMBED_DIM = 60
+        else:
+            self.EMBED_DIM = EMBED_DIM
+        if BiRNN_UNITS is None:
+            self.BiRNN_UNITS = 200
+        else:
+            self.BiRNN_UNITS = BiRNN_UNITS
+        # Legacy keras weight files; the TF saved-models below are what is used now.
+        self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
+        #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
+        self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
+        vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
+        classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
+        self.vocab = load(vocabpath)
+        self.class_labels = load(classlabelspath)
+        
+        # Build regexes over label-id strings to pull out project-code (PC) and
+        # project-name (PN) spans: Begin, zero or more Middle, then End.
+        id_PC_B = self.class_labels.index("PC_B")
+        id_PC_M = self.class_labels.index("PC_M")
+        id_PC_E = self.class_labels.index("PC_E")
+        id_PN_B = self.class_labels.index("PN_B")
+        id_PN_M = self.class_labels.index("PN_M")
+        id_PN_E = self.class_labels.index("PN_E")
+        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
+        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
+        # print("pc",self.PC_pattern)
+        # print("pn",self.PN_pattern)
+        self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
+        
+        # Separate graphs/sessions for the code+name tagger and the code splitter.
+        self.inputs = None
+        self.outputs = None
+        self.sess_codename = tf.Session(graph=tf.Graph())
+        self.sess_codesplit = tf.Session(graph=tf.Graph())
+        self.inputs_code = None
+        self.outputs_code = None
+        if not lazyLoad:
+            self.getModel()
+            self.getModel_code()
+        
+        
+        
+    def getModel(self):
+        '''
+        @summary: Lazily load the TF saved-model for code/name tagging and
+            cache its input/output tensors on this instance.
+        @return: (inputs, inputs_length, keepprob, logits, trans) tensors.
+        '''
+        if self.inputs is None:
+            log("get model of codename")
+            with self.sess_codename.as_default():
+                with self.sess_codename.graph.as_default():
+                    # Resolve tensors via the default serving signature of the saved model.
+                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
+                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+                    signature_def = meta_graph_def.signature_def
+                    self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
+                    self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
+                    self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
+                    self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
+                    self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
+
+                return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
+        else:
+            return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
+        '''    
+        if self.model is None:
+            self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
+            self.model.load_weights(self.filepath)
+        return self.model
+        '''
+    
+    def getModel_code(self):
+        """Lazily load the TF saved-model for code splitting and cache its tensors.
+
+        :return: (inputs_code, outputs_code) where inputs_code is the list of
+            the three input tensors [input0, input1, input2].
+        """
+        if self.inputs_code is None:
+            log("get model of code")
+            with self.sess_codesplit.as_default():
+                with self.sess_codesplit.graph.as_default():
+                    meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
+                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+                    signature_def = meta_graph_def.signature_def
+                    self.inputs_code = []
+                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
+                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
+                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
+                    self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
+                    # Freeze the graph so no further ops can be added by accident.
+                    self.sess_codesplit.graph.finalize()
+                    return self.inputs_code,self.outputs_code
+        else:
+            return self.inputs_code,self.outputs_code
+        '''
+        if self.model_code is None:
+            log("get model of model_code")
+            with self.sess_codesplit.as_default():
+                with self.sess_codesplit.graph.as_default():
+                    self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+        return self.model_code
+        '''
+    
+    def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
+        '''
+        Build a keras Embedding -> BiLSTM -> Dense -> CRF tagging model.
+
+        NOTE(review): the keras/keras_contrib imports at the top of this file
+        are commented out, so calling this builder would raise NameError on
+        ``layers``/``models``/``CRF`` -- it appears to be kept only for
+        retraining the legacy .hdf5 weights. Confirm before use.
+
+        :param MAX_LEN: unused here; input length is left variable (shape=(None,)).
+        :param vocab: vocabulary; its length sizes the embedding table.
+        :param EMBED_DIM: embedding dimension.
+        :param BiRNN_UNITS: total BiLSTM units (half per direction).
+        :param chunk_tags: label set; its length sizes the Dense/CRF layers.
+        :param weights: optional pretrained embedding matrix; trainable if given.
+        :return: a compiled keras Model.
+        '''
+        input = layers.Input(shape=(None,))
+        if weights is not None:
+            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
+        else:
+            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
+        bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
+        bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
+        crf = CRF(len(chunk_tags),sparse_target=True)
+        crf_out = crf(bilstm_dense)
+        model = models.Model(input=[input],output = [crf_out])
+        model.summary()
+        model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
+        return model
+    
+    # Balance the brackets around an extracted code/name by rule.
+    def fitDataByRule(self,data):
+        """Repair a single unmatched bracket at either end of ``data``.
+
+        If left/right bracket counts differ by exactly one, either strip a
+        dangling bracket at the affected end or append/prepend its partner;
+        otherwise ``data`` is returned unchanged.
+
+        :param data: extracted code or name string (assumed non-empty --
+            ``data[0]``/``data[-1]`` would raise on "").
+        :return: the possibly-repaired string.
+        """
+        symbol_dict = {"(":")",
+                       "(":")",
+                       "[":"]",
+                       "【":"】",
+                       ")":"(",
+                       ")":"(",
+                       "]":"[",
+                       "】":"【"}
+        leftSymbol_pattern = re.compile("[\((\[【]")
+        rightSymbol_pattern = re.compile("[\))\]】]")
+        leftfinds = re.findall(leftSymbol_pattern,data)
+        rightfinds = re.findall(rightSymbol_pattern,data)
+        result = data
+        if len(leftfinds)+len(rightfinds)==0:
+            return data
+        elif len(leftfinds)==len(rightfinds):
+            return data
+        elif abs(len(leftfinds)-len(rightfinds))==1:
+            if len(leftfinds)>len(rightfinds):
+                # Extra left bracket: drop it if it opens the string, else close it.
+                if symbol_dict.get(data[0]) is not None:
+                    result = data[1:]
+                else:
+                    #print(symbol_dict.get(leftfinds[0]))
+                    result = data+symbol_dict.get(leftfinds[0])
+            else:
+                # Extra right bracket: drop it if it ends the string, else open it.
+                if symbol_dict.get(data[-1]) is not None:
+                    result = data[:-1]
+                else:
+                    result = symbol_dict.get(rightfinds[0])+data
+        return  result
+
+    def decode(self,logits, trans, sequence_lengths, tag_num):
+        """Viterbi-decode each padded logit sequence into a label-id sequence.
+
+        :param logits: batch of per-token label scores (padded to max length).
+        :param trans: CRF transition matrix.
+        :param sequence_lengths: true (unpadded) length of each sequence.
+        :param tag_num: unused by this implementation.
+        :return: list of decoded label-id sequences, one per input.
+        """
+        viterbi_sequences = []
+        for logit, length in zip(logits, sequence_lengths):
+            score = logit[:length]
+            # viterbi_score (path score) is discarded; only the path is kept.
+            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+            viterbi_sequences.append(viterbi_seq)
+        return viterbi_sequences
+    
    def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
        """Extract the project code(s) and the project name of each document.

        Runs the sequence-labelling model over every document's sentences in
        length-sorted batches, classifies candidate code spans with a second
        model, scores candidate names by keyword/frequency, and falls back to
        regexes when the models find nothing.

        :param list_sentences: one list of Sentence objects per document
        :param list_entitys: optional parallel list of entity lists; detected
            code/name entities are appended to these lists in place
        :param MAX_AREA: character budget fed to the model per batch
        :return: one dict ``{"code": [...], "name": "..."}`` per document
        """
        # Keyword regex used to score candidate project names: every match
        # adds to the candidate's score below.
        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")

        result = []
        index_unk = self.word2index.get("<unk>")
        # index_pad = self.word2index.get("<pad>")
        if list_entitys is None:
            list_entitys = [[] for _ in range(len(list_sentences))]
        for list_sentence,list_entity in zip(list_sentences,list_entitys):
            if len(list_sentence)==0:
                # NOTE(review): this wraps the dict in a list, unlike the
                # normal path below which appends the bare dict — looks like a
                # bug; confirm against callers before changing.
                result.append([{"code":[],"name":""}])
                continue
            doc_id = list_sentence[0].doc_id
            # sentences = []
            # for sentence in list_sentence:
            #     if len(sentence.sentence_text)>MAX_AREA:
            #         for _sentence_comma in re.split("[;;,\n]",sentence):
            #             _comma_index = 0
            #             while(_comma_index<len(_sentence_comma)):
            #                 sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
            #                 _comma_index += MAX_AREA
            #     else:
            #         sentences.append(sentence+"。")
            # Sort longest-first so each batch packs sentences of similar
            # length under the MAX_AREA budget; original order is restored at
            # the end of the loop.
            list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
            _begin_index = 0
            
            item = {"code":[],"name":""}
            code_set = set()
            # candidate name -> [frequency, keyword score]
            dict_name_freq_score = dict()
            while(True):
                # Batch size: as many sentences as fit when each is padded to
                # the length of the current longest one.
                MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                if MAX_LEN>MAX_AREA:
                    MAX_LEN = MAX_AREA
                _LEN = MAX_AREA//MAX_LEN
                # Run the sequence-labelling (code/name tagging) model.

                x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")

                if USE_API:
                    # Remote inference endpoint; returns the decoded tag paths.
                    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True)
                    predict_y = json.loads(requests_result.text)['result']
                    # print("cost_time:", json.loads(requests_result.text)['cost_time'])
                    # print(MAX_LEN,_LEN,_begin_index)
                else:
                    # Local TF session + CRF Viterbi decode.
                    with self.sess_codename.as_default():
                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
                                                                                              t_input_length:x_len,
                                                                                              t_keepprob:1.0})
                        predict_y = self.decode(_logits,_trans,x_len,7)
                        # print('==========',_logits)

                        '''
                        for item11 in np.argmax(predict_y,-1):
                            print(item11)
                        print(predict_y)
                        '''
                # print(predict_y)
                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                    pad_sentence = sentence.sentence_text[:MAX_LEN]
                    # Flatten the tag path into a string so span patterns
                    # (PC_pattern / PN_pattern) can be regex-matched against it.
                    join_predict = "".join([str(s) for s in predict])
                    # print(pad_sentence)
                    # print(join_predict)
                    code_x = []
                    code_text = []
                    temp_entitys = []
                    for iter in re.finditer(self.PC_pattern,join_predict):
                        # Build a (left context, span, right context) window of
                        # up to get_len chars on each side for the code classifier.
                        get_len = 40
                        if iter.span()[0]<get_len:
                            begin = 0
                        else:
                            begin = iter.span()[0]-get_len
                        end = iter.span()[1]+get_len
                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""))
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                        temp_entitys.append(_entity)
                    #print("code",code_text)
                    if len(code_x)>0:
                        # (candidates, 3 windows, get_len, 60) -> (3, candidates, get_len, 60)
                        code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
                        if USE_PAI_EAS:
                            # Serialize the three input windows into a PAI-EAS
                            # protobuf request; fall back to the local model on failure.
                            request = tf_predict_pb2.PredictRequest()
                            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                            request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
                            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                            request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
                            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                            request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
                            request_data = request.SerializeToString()
                            list_outputs = ["outputs"]
                            _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                            if _result is not None:
                                predict_code = _result["outputs"]
                            else:
                                with self.sess_codesplit.as_default():
                                    with self.sess_codesplit.graph.as_default():
                                        predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        else:
                            with self.sess_codesplit.as_default():
                                with self.sess_codesplit.graph.as_default():
                                    inputs_code,outputs_code = self.getModel_code()
                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]

                                    #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                    #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        for h in range(len(predict_code)):
                            # Keep only candidates the classifier accepts (>0.5).
                            if predict_code[h][0]>0.5:
                                the_code = self.fitDataByRule(code_text[h])

                                #add code to entitys
                                list_entity.append(temp_entitys[h])

                                if the_code not in code_set:
                                    code_set.add(the_code)
                                    item['code'] = list(code_set)
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])

                        #add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                        list_entity.append(_entity)
                        # Weight 1 when the name is preceded by an explicit
                        # "project name:"-style label in the 10 chars before it,
                        # 0.5 otherwise.
                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    '''
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
                    if item[1]['name']=="":
                        for iter in re.finditer(self.PN_pattern,join_predict):
                            #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            break
                    '''
                if _begin_index+_LEN>=len(list_sentence):
                    break
                _begin_index += _LEN
            
            list_name_freq_score = []

            # 2020/11/23: rule adjustment for large portal sites — if the model
            # found no name at all, fall back to an explicit "project name:" regex.
            if len(dict_name_freq_score) == 0:
                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
                for sentence in list_sentence:
                    # pad_sentence = sentence.sentence_text
                    othername = re.search(name_re1, sentence.sentence_text)
                    if othername != None:
                        project_name = othername.group(3)
                        beg = find_index([project_name], sentence.sentence_text)[0]
                        end = beg + len(project_name)
                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                        sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                                         entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
                        list_entity.append(_entity)
                        w = 1
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                # othername = re.search(name_re1, sentence.sentence_text)
                # if othername != None:
                #     _name = othername.group(3)
                #     if _name not in dict_name_freq_score:
                #         dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
                #     else:
                #         dict_name_freq_score[_name][0] += 1

            for _name in dict_name_freq_score.keys():
                list_name_freq_score.append([_name,dict_name_freq_score[_name]])
            # print(list_name_freq_score)
            if len(list_name_freq_score)>0:
                # Pick the candidate with the highest frequency * keyword score.
                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
                item['name'] = list_name_freq_score[0][0]
                # if list_name_freq_score[0][1][0]>1:
                #     item[1]['name'] = list_name_freq_score[0][0]
                # else:
                #     list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
                #     item[1]["name"] = list_name_freq_score[0][0]
                
            # Regex fallback for project codes the model failed to recognise.
            if item['code'] == []:
                for sentence in list_sentence:
                    # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                    # if othercode != None:
                    #     item[1]['code'].append(othercode.group(2))
                    # 2020/11/23: rule adjustment for large portal sites
                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                    if othercode != None:
                        item['code'].append(othercode.group(3))
            # Longest code first.
            item['code'].sort(key=lambda x:len(x),reverse=True)
            result.append(item)

            # Restore the original sentence order (undo the length sort above).
            list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
        return result
+                        
+                
+    '''
+    #当数据量过大时会报错
+    def predict(self,articles,MAX_LEN = None):
+        sentences = []
+        for article in articles:
+            for sentence in article.content.split("。"):
+                sentences.append([sentence,article.id])
+        if MAX_LEN is None:
+            sent_len = [len(sentence[0]) for sentence in sentences]
+            MAX_LEN = max(sent_len)
+            #print(MAX_LEN)
+           
+        #若为空,则直接返回空
+        result = [] 
+        if MAX_LEN==0:
+            for article in articles:
+                result.append([article.id,{"code":[],"name":""}])
+            return result
+        
+        index_unk = self.word2index.get("<unk>")
+        index_pad = self.word2index.get("<pad>")
+        
+        x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
+        x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
+        
+        predict_y = self.getModel().predict(x)
+        
+        
+        last_doc_id = ""
+        item = []
+        for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
+            pad_sentence = sentence[0][:MAX_LEN]
+            doc_id = sentence[1]
+            join_predict = "".join([str(s) for s in predict])
+            if doc_id!=last_doc_id:
+                if last_doc_id!="":
+                    result.append(item)
+                item = [doc_id,{"code":[],"name":""}]
+                code_set = set()
+            code_x = []
+            code_text = []
+            for iter in re.finditer(self.PC_pattern,join_predict):
+                get_len = 40
+                if iter.span()[0]<get_len:
+                    begin = 0
+                else:
+                    begin = iter.span()[0]-get_len
+                end = iter.span()[1]+get_len
+                code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
+                code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
+            if len(code_x)>0:
+                code_x = np.transpose(np.array(code_x),(1,0,2,3))
+                predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
+                for h in range(len(predict_code)):
+                    if predict_code[h][0]>0.5:
+                        the_code = self.fitDataByRule(code_text[h])
+                        if the_code not in code_set:
+                            code_set.add(the_code)
+                            item[1]['code'] = list(code_set)
+            if item[1]['name']=="":
+                for iter in re.finditer(self.PN_pattern,join_predict):
+                    #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
+                    item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
+                    break
+                
+            last_doc_id = doc_id
+        result.append(item)
+        return result
+    '''
+        
+#角色金额模型        
class PREMPredict():
    """Role / money classification wrapper.

    Builds model inputs from (sentence, entity) pairs, runs the role
    classifier on org/company entities and the money classifier on money
    entities, applies regex-based label overrides, and writes the final
    label back onto each entity via ``set_Role`` / ``set_Money``.
    """

    
    def __init__(self):
        #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()
        
        return
    
    def search_role_data(self,list_sentences,list_entitys):
        '''
        @summary: build the role-model inputs from the sentence and entity lists
        @param:
            list_sentences: the sentences of each document
            list_entitys: the entities of each document
        @return: [features, matched entities, context texts] for the role
            model, or None when no org/company entity is found
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):

            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            # NOTE: p_sentences is intentionally NOT reset per entity — this is
            # a two-pointer scan that relies on both lists being sorted by
            # sentence_index above.
            p_sentences = 0
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['org','company']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # Keep a 10-char context window around the entity for
                            # the regex overrides in predict_role.
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                            item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                    
                p_entitys += 1
            
        if len(points_entitys)==0:
            return None
        
        return [data_x,points_entitys, text_list]
    
    
    def search_money_data(self,list_sentences,list_entitys):
        '''
        @summary: build the money-model inputs from the sentence and entity lists
        @param:
            list_sentences: the sentences of each document
            list_entitys: the entities of each document
        @return: [features, matched entities, context texts] for the money
            model, or None when no money entity is found
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):

            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
    
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type=="money":
                    # Unlike search_role_data, the sentence pointer restarts for
                    # every money entity.
                    p_sentences = 0
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # 8 chars of left context used by the overrides in predict_money.
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        
        if len(points_entitys)==0:
            return None
        
        return [data_x,points_entitys, text_list]
    
    def predict_role(self,list_sentences, list_entitys):
        """Classify org/company entities into roles and store the result on each entity."""
        datas = self.search_role_data(list_sentences, list_entitys)

        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]


        if USE_PAI_EAS:
            # Remote PAI-EAS inference; falls back to the local model on failure.
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # Regex-based overrides of the model's label using the context
            # window captured in search_role_data (label indices are role
            # classes; exact role semantics live in set_Role — TODO confirm).
            if label == 2:
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商:.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                label = 0
                values[label] = 0.501
            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
                label = 2
                values[label] = 0.501
            entity.set_Role(label, values)

    def predict_money(self,list_sentences,list_entitys):
        """Classify money entities and store the result on each entity."""
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            # Remote PAI-EAS inference; falls back to the local model on failure.
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # Demote the winning class below 0.5 when the left context or the
            # entity's notes contradict the model's label.
            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label ==0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)
        
    def predict(self,list_sentences,list_entitys):
        """Run both the role and the money classifiers over the documents."""
        self.predict_role(list_sentences,list_entitys)
        self.predict_money(list_sentences,list_entitys)
+        
+        
+#联系人模型    
+class EPCPredict():
+    
    def __init__(self):
        # Classifier used by predict_person to label person entities.
        self.model_person = Model_person_classify()
+
+
+    
+    def search_person_data(self,list_sentences,list_entitys):
+        '''
+        @summary:根据句子list和实体list查询联系人模型的输入数据
+        @param:
+            list_sentences:文章的sentences
+            list_entitys:文章的entitys
+        @return:联系人模型的输入数据
+        '''
+
+        data_x = []
+        points_entitys = []
+        for list_entity,list_sentence in zip(list_entitys,list_sentences):
+            
+            p_entitys = 0
+            dict_index_sentence = {}
+            for _sentence in list_sentence:
+                dict_index_sentence[_sentence.sentence_index] = _sentence
+
+            _list_entity = [entity for entity in list_entity if entity.entity_type=="person"]
+            while(p_entitys<len(_list_entity)):
+                entity = _list_entity[p_entitys]
+                if entity.entity_type=="person":
+                    sentence = dict_index_sentence[entity.sentence_index]
+
+                    item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
+                    data_x.append(item_x)
+                    points_entitys.append(entity)
+
+                p_entitys += 1
+
+        if len(points_entitys)==0:
+            return None
+        
+        # return [data_x,points_entitys,dianhua]
+        return [data_x,points_entitys]
+
    def predict_person(self,list_sentences, list_entitys):
        """Classify person entities and store label/probabilities on each entity.

        The phone-matching step is currently disabled (see the commented-out
        call at the end); every entity gets an empty phone list.
        """
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        # phone = datas[2]
        if USE_PAI_EAS:
            # Remote PAI-EAS inference; falls back to the local model on failure.
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_person.predict(datas[0])
        else:
            predict_y = self.model_person.predict(datas[0])
        # assert len(predict_y)==len(points_entitys)==len(phone)
        assert len(predict_y)==len(points_entitys)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            # Copy the probability row into a plain list before storing it.
            values = []
            for item in predict_y[i]:
                values.append(item)
            # phone_number = phone[i]
            # entity.set_Person(label,values,phone_number)
            entity.set_Person(label,values,[])
        # Match phone numbers to contacts (disabled):
        # self.person_search_phone(list_sentences, list_entitys)
+
+    def person_search_phone(self,list_sentences, list_entitys):
+        def phoneFromList(phones):
+            # for phone in phones:
+            #     if len(phone)==11:
+            #         return re.sub('电话[:|:]|联系方式[:|:]','',phone)
+            return re.sub('电话[:|:]|联系方式[:|:]', '', phones[0])
+
+        for list_entity, list_sentence in zip(list_entitys, list_sentences):
+            # p_entitys = 0
+            # p_sentences = 0
+            #
+            # key_word = re.compile('电话[:|:].{0,4}\d{7,12}|联系方式[:|:].{0,4}\d{7,12}')
+            # # phone = re.compile('1[3|4|5|7|8][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}')  # 联系电话
+            # # 2020/11/25 增加发现的号码段
+            # phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
+            #                    '\d{3,4}[-—-][1-9]\d{6,7}/\d{3,8}|'
+            #                    '\d{3,4}[-—-]\d{7,8}转\d{1,4}|'
+            #                    '\d{3,4}[-—-]?[1-9]\d{6,7}|'
+            #                    '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
+            #                    '[1-9]\d{6,7}')  # 联系电话
+            # dict_index_sentence = {}
+            # for _sentence in list_sentence:
+            #     dict_index_sentence[_sentence.sentence_index] = _sentence
+            #
+            # dict_context_itemx = {}
+            # last_person = "####****++++$$^"
+            # last_person_phone = "####****++++$^"
+            # _list_entity = [entity for entity in list_entity if entity.entity_type == "person"]
+            # while (p_entitys < len(_list_entity)):
+            #     entity = _list_entity[p_entitys]
+            #     if entity.entity_type == "person" and entity.label in [1,2,3]:
+            #         sentence = dict_index_sentence[entity.sentence_index]
+            #         # item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_PERSON_INPUT_SHAPE[1]),shape=settings.MODEL_PERSON_INPUT_SHAPE)
+            #
+            #         # s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20)
+            #
+            #         # 2021/5/8 取上下文的句子,解决表格处理的分句问题
+            #         left_sentence = dict_index_sentence.get(entity.sentence_index - 1)
+            #         left_sentence_tokens = left_sentence.tokens if left_sentence else []
+            #         right_sentence = dict_index_sentence.get(entity.sentence_index + 1)
+            #         right_sentence_tokens = right_sentence.tokens if right_sentence else []
+            #         entity_beginIndex = entity.begin_index + len(left_sentence_tokens)
+            #         entity_endIndex = entity.end_index + len(left_sentence_tokens)
+            #         context_sentences_tokens = left_sentence_tokens + sentence.tokens + right_sentence_tokens
+            #         s = spanWindow(tokens=context_sentences_tokens, begin_index=entity_beginIndex,
+            #                        end_index=entity_endIndex, size=20)
+            #
+            #         _key = "".join(["".join(x) for x in s])
+            #         if _key in dict_context_itemx:
+            #             _dianhua = dict_context_itemx[_key][0]
+            #         else:
+            #             s1 = ''.join(s[1])
+            #             # s1 = re.sub(',)', '-', s1)
+            #             s1 = re.sub('\s', '', s1)
+            #             have_key = re.findall(key_word, s1)
+            #             have_phone = re.findall(phone, s1)
+            #             s0 = ''.join(s[0])
+            #             # s0 = re.sub(',)', '-', s0)
+            #             s0 = re.sub('\s', '', s0)
+            #             have_key2 = re.findall(key_word, s0)
+            #             have_phone2 = re.findall(phone, s0)
+            #
+            #             s3 = ''.join(s[1])
+            #             # s0 = re.sub(',)', '-', s0)
+            #             s3 = re.sub(',|,|\s', '', s3)
+            #             have_key3 = re.findall(key_word, s3)
+            #             have_phone3 = re.findall(phone, s3)
+            #
+            #             s4 = ''.join(s[0])
+            #             # s0 = re.sub(',)', '-', s0)
+            #             s4 = re.sub(',|,|\s', '', s0)
+            #             have_key4 = re.findall(key_word, s4)
+            #             have_phone4 = re.findall(phone, s4)
+            #
+            #             _dianhua = ""
+            #             if have_phone:
+            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_phone) > 1:
+            #                         _dianhua = phoneFromList(have_phone[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_phone)
+            #             elif have_key:
+            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_key) > 1:
+            #                         _dianhua = phoneFromList(have_key[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_key)
+            #             elif have_phone2:
+            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_phone2) > 1:
+            #                         _dianhua = phoneFromList(have_phone2[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_phone2)
+            #             elif have_key2:
+            #                 if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_key2) > 1:
+            #                         _dianhua = phoneFromList(have_key2[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_key2)
+            #             elif have_phone3:
+            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_phone3) > 1:
+            #                         _dianhua = phoneFromList(have_phone3[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_phone3)
+            #             elif have_key3:
+            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_key3) > 1:
+            #                         _dianhua = phoneFromList(have_key3[1:])
+            #                 else:
+            #                     _dianhua = phoneFromList(have_key3)
+            #             elif have_phone4:
+            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_phone4) > 1:
+            #                         _dianhua = phoneFromList(have_phone4)
+            #                 else:
+            #                     _dianhua = phoneFromList(have_phone4)
+            #             elif have_key4:
+            #                 if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
+            #                         last_person_phone) != -1:
+            #                     if len(have_key4) > 1:
+            #                         _dianhua = phoneFromList(have_key4)
+            #                 else:
+            #                     _dianhua = phoneFromList(have_key4)
+            #             else:
+            #                 _dianhua = ""
+            #             # dict_context_itemx[_key] = [item_x, _dianhua]
+            #             dict_context_itemx[_key] = [_dianhua]
+            #         # points_entitys.append(entity)
+            #         # dianhua.append(_dianhua)
+            #         last_person = entity.entity_text
+            #         if _dianhua:
+            #             # 更新联系人entity联系方式(person_phone)
+            #             entity.person_phone = _dianhua
+            #             last_person_phone = _dianhua
+            #         else:
+            #             last_person_phone = "####****++++$^"
+            #     p_entitys += 1
+
+            from scipy.optimize import linear_sum_assignment
+            from BiddingKG.dl.interface.Entitys import Match
+            def dispatch(match_list):
+                main_roles = list(set([match.main_role for match in match_list]))
+                attributes = list(set([match.attribute for match in match_list]))
+
+                label = np.zeros(shape=(len(main_roles), len(attributes)))
+                for match in match_list:
+                    main_role = match.main_role
+                    attribute = match.attribute
+                    value = match.value
+                    label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
+                # print(label)
+                gragh = -label
+                # km算法
+                row, col = linear_sum_assignment(gragh)
+                max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
+                return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
+            # km算法
+            key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
+            phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
+                               '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
+                               '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
+                               '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
+                               '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
+                               '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                               '[1-9]\d{6,7}')
+            phone_entitys = []
+            for _sentence in list_sentence:
+                sentence_text = _sentence.sentence_text
+                res_set = set()
+                for i in re.finditer(phone,sentence_text):
+                    res_set.add((i.group(),i.start(),i.end()))
+                for i in re.finditer(key_word,sentence_text):
+                    res_set.add((i.group(2),i.start()+len(i.group(1)),i.end()))
+                for item in list(res_set):
+                    phone_left = sentence_text[max(0,item[1]-10):item[1]]
+                    phone_right = sentence_text[item[2]:item[2]+8]
+                    # 排除传真号 和 其它错误项
+                    if re.search("传,?真|信,?箱|邮,?箱",phone_left):
+                        if not re.search("电,?话",phone_left):
+                            continue
+                    if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left):
+                        continue
+                    if re.search("[.,]\d{2,}",phone_right):
+                        continue
+                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment)
+                    phone_entitys.append(_entity)
+            person_entitys = []
+            for entity in list_entity:
+                if entity.entity_type == "person":
+                    entity.person_phone = ""
+                    person_entitys.append(entity)
+            _list_entity = phone_entitys + person_entitys
+            _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin))
+
+            words_num_dict = dict()
+            last_words_num = 0
+            list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
+            for sentence in list_sentence:
+                _index = sentence.sentence_index
+                if _index == 0:
+                    words_num_dict[_index] = 0
+                else:
+                    words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
+                last_words_num = len(sentence.sentence_text)
+            match_list = []
+            for index in range(len(_list_entity)):
+                entity = _list_entity[index]
+                if entity.entity_type=="person" and entity.label in [1,2,3]:
+                    match_nums = 0
+                    for after_index in range(index + 1, min(len(_list_entity), index + 5)):
+                        after_entity = _list_entity[after_index]
+                        if after_entity.entity_type=="phone":
+                            sentence_distance = after_entity.sentence_index - entity.sentence_index
+                            distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
+                                    words_num_dict[entity.sentence_index] + entity.wordOffset_end)
+                            if sentence_distance < 2 and distance < 50:
+                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                match_list.append(Match(entity, after_entity, value))
+                                match_nums += 1
+                            else:
+                                break
+                        if after_entity.entity_type=="person":
+                            if after_entity.label not in [1,2,3]:
+                                break
+                    if not match_nums:
+                        for previous_index in range(index-1, max(0,index-5), -1):
+                            previous_entity = _list_entity[previous_index]
+                            if previous_entity.entity_type == "phone":
+                                sentence_distance = entity.sentence_index - previous_entity.sentence_index
+                                distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
+                                        words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
+                                if sentence_distance < 1 and distance<30:
+                                    # 前向 没有 /10000
+                                    value = (-1 / 2 * (distance ** 2))
+                                    match_list.append(Match(entity, previous_entity, value))
+                                else:
+                                    break
+
+            result = dispatch(match_list)
+            for match in result:
+                entity = match.main_role
+                # 更新 list_entity
+                entity_index = list_entity.index(entity)
+                list_entity[entity_index].person_phone = match.attribute.entity_text
+
+
+    def predict(self,list_sentences,list_entitys):
+        self.predict_person(list_sentences,list_entitys)
+            
# Form/table predictor
class FormPredictor():
    """Dispatcher for table-cell ("form") classification.

    Routes requests to one of two sub-models: ``"item"`` (single-cell
    model, ``Model_form_item``) or ``"context"`` (context-aware model,
    ``Model_form_context``).  Public interface (``getModel`` / ``encode``
    / ``predict``) is unchanged from the original; the unknown-type
    fallback no longer recurses forever.
    """

    def __init__(self, lazyLoad=None):
        # NOTE(review): the original evaluated getLazyLoad() in the default
        # argument, i.e. once at import time; resolving it lazily here keeps
        # the same effective value while avoiding the eager-default trap.
        if lazyLoad is None:
            lazyLoad = getLazyLoad()
        # Model file paths kept for backward compatibility; only the "line"
        # file is actually referenced (through model_dict).
        self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
        self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
        self.model_form_item = Model_form_item()
        self.model_form_context = Model_form_context()
        self.model_dict = {"line":[None,self.model_file_line]}

    def getModel(self, type):
        """Return the sub-model for *type* ("item" or "context").

        Raises:
            ValueError: for any other type.  (The original recursed into
            itself unconditionally here, guaranteeing a RecursionError.)
        """
        if type == "item":
            return self.model_form_item
        elif type == "context":
            return self.model_form_context
        raise ValueError("unknown form model type: %r" % (type,))

    def encode(self, data, **kwargs):
        # The original had a second, unreachable
        # `return encodeInput_form(data)` after this line; removed.
        return encodeInput([data], word_len=50, word_flag=True, userFool=False)[0]

    def predict(self, form_datas, type):
        """Run prediction with the sub-model selected by *type*.

        Unified through getModel() so unknown types raise ValueError
        instead of recursing (behavior for "item"/"context" unchanged).
        """
        return self.getModel(type).predict(form_datas)
+
+    
+
#Role rules
#Use regexes to assign a role to every entity that has none, giving it the minimum probability equal to the threshold
+class RoleRulePredictor():
+    
    def __init__(self):
        """Compile the context regex patterns used to recall roles
        (tenderee, agency, winner, 2nd/3rd candidates) plus the money and
        bid-package heuristics.

        Pattern naming: ``<role>_left`` / ``_center`` / ``_right`` encodes
        which side of the candidate entity the pattern is matched against;
        a ``_w1`` suffix in the group name marks a probability-weight
        keyword (matched groups with this suffix get a 1.2 weight instead
        of 1 in predict()).
        """
        # --- tenderee context patterns ---
        # In group names like (?P<tenderee_left_w1>), the "_w1" suffix marks a probability-weight keyword group
        self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                     "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                     "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
        # Ambiguous right-context: could be tenderee OR agency, disambiguated in predict()
        self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
        # --- agency context patterns ---
        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # "^受托" omitted: it clashes with terms like 受托生产; agency phrasing usually carries a trailing comma
        # --- winner (winTenderer) context patterns ---
        # 2020/11/24 big-site rules: added 选定单位|指定的中介服务机构 to the winner keywords
        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                        "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?[::是为]+$)" # comma dropped to avoid false matches such as "并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系"
        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
        # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商))"
        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方"   # 2020/11/24 big-site rules: added 谈判结果:由.{5,20}供货 to the winner keywords

        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"

        # --- second / third winner candidate patterns ---
        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        
        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"

        # All role patterns, iterated in predict(); each pattern contains a
        # named group whose name encodes "<role>_<direction>[_w1]".
        self.pattern_whole = [self.pattern_tenderee_left,
                              self.pattern_tenderee_left_w1,
                              self.pattern_tenderee_center,
                              self.pattern_tenderee_right,
                              self.pattern_tendereeORagency_right,
                              self.pattern_agency_left,
                              self.pattern_agency_right,
                              self.pattern_winTenderer_left,
                              self.pattern_winTenderer_left_w1,
                              self.pattern_winTenderer_whole,
                              self.pattern_winTenderer_right,
                              self.pattern_secondTenderer_left,
                              self.pattern_secondTenderer_right,
                              self.pattern_thirdTenderer_left,
                              self.pattern_thirdTenderer_right
                              ]  # order matters: second/third-winner patterns must come after the winner patterns

        # Entity texts that can never be a winner (government bodies, courts, etc.)
        self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
        
        # --- money classification heuristics (tenderee budget vs winner bid) ---
        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
        # Fee-like amounts (agency fee / service fee) that must NOT be mistaken for bid money
        self.pattern_money_other = re.compile("代理费|服务费")
        # Bid-package / lot identifiers, used to extend tenderee-money over consecutive amounts
        self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
+        
+    def _check_input(self,text, ignore=False):
+        if not text:
+            return []
+        
+        if not isinstance(text, list):
+            text = [text]
+        
+        null_index = [i for i, t in enumerate(text) if not t]
+        if null_index and not ignore:
+            raise Exception("null text in input ")
+        
+        return text
+
+
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
+
+        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
+                                                                      list_codenames):
+            list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
+            # list_name = list_codename["name"]
+            list_name = []  # 2022/1/5  改为实体列表内所有项目名称
+            for entity in list_entity:
+                if entity.entity_type == 'name':
+                    list_name.append(entity.entity_text)
+            list_name = self._check_input(list_name) + [article.title]
+            for p_entity in list_entity:
+
+                if p_entity.entity_type in ["org", "company"]:
+                    # 只解析角色为无的或者概率低于阈值的
+                    if p_entity.label is None:
+                        continue
+                    # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
+                    if str(p_entity.label) == "0":
+                        find_flag = False
+                        for _sentence in list_sentence:
+                            if _sentence.sentence_index == p_entity.sentence_index:
+                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                                   end_index=p_entity.end_index, size=20, center_include=True,
+                                                   word_flag=True, use_text=True,
+                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
+                                for _name in list_name:
+                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
+                                        find_flag = True
+                                        if p_entity.values[0] > on_value:
+                                            p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
+                                        else:
+                                            p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
+                        if find_flag:
+                            continue
+
+                    # 正则从概率低于阈值或其他类别中召回角色
+                    role_prob = float(p_entity.values[int(p_entity.label)])
+                    if role_prob < on_value or str(p_entity.label) == "5":
+                        # 将标题中的实体置为招标人
+                        _list_name = self._check_input(list_name, ignore=True)
+                        find_flag = False
+                        for _name in _list_name:  # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
+                            if str(_name).find(re.sub(")", ")", re.sub("(", "(",
+                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
+                                for _sentence in list_sentence:
+                                    if _sentence.sentence_index == p_entity.sentence_index:
+                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                                           end_index=p_entity.end_index, size=20, center_include=True,
+                                                           word_flag=True, use_text=True, text=re.sub(")", ")",
+                                                                                                      re.sub("(", "(",
+                                                                                                             p_entity.entity_text)))
+                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
+                                                _name) >= 0:
+                                            find_flag = True
+                                            _label = 0
+                                            p_entity.label = _label
+                                            p_entity.values[int(_label)] = on_value
+                                            break
+                                    if p_entity.sentence_index >= 4:
+                                        break
+                            if find_flag:
+                                break
+                            # if str(_name).find(p_entity.entity_text)>=0:
+                            #     find_flag = True
+                            #     _label = 0
+                            #     p_entity.label = _label
+                            #     p_entity.values[int(_label)] = on_value
+                            #     break
+                        # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
+                        if find_flag:
+                            continue
+
+                        for s_index in range(len(list_sentence)):
+                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
+                                    list_sentence[s_index].sentence_index:
+                                tokens = list_sentence[s_index].tokens
+                                begin_index = p_entity.begin_index
+                                end_index = p_entity.end_index
+                                size = 15
+                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
+                                                   word_flag=True, use_text=False)
+                                # _flag = False
+
+                                # 使用正则+距离解决冲突
+                                # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
+                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息
+                                for _i_span in range(len(list_spans)):
+                                    _flag = False
+                                    _prob_weight = 1
+
+                                    # print(list_spans[_i_span],p_entity.entity_text)
+                                    for _pattern in self.pattern_whole:
+                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
+                                            for _group, _v_group in _iter.groupdict().items():
+                                                if _v_group is not None and _v_group != "":
+                                                    _role = _group.split("_")[0]
+                                                    if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
+                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
+                                                        if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
+                                                            continue
+                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
+                                                            or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
+                                                            _role = 'tenderee'
+                                                        else:
+                                                            _role = "agency"
+                                                    _direct = _group.split("_")[1]
+                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
+                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                                                                        list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
+                                                        _flag = True
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight=='w1' else 1
+                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)
+
+                                                    if _i_span == 1 and _direct == "center":
+                                                        _flag = True
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
+                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
+
+                                                    if _i_span == 2 and _direct == "right":
+                                                        _flag = True
+                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
+                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
+
+                                    # 得到结果
+                                    if _flag:
+                                        p_entity.label = _label
+                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
+                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
+                                        break
+
+                # 其他金额通过正则召回可能是招标或中投标的金额
+                if p_entity.entity_type in ["money"]:
+                    if str(p_entity.label) == "2":
+                        for _sentence in list_sentence:
+                            if _sentence.sentence_index == p_entity.sentence_index:
+                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                                   end_index=p_entity.end_index, size=20, center_include=True,
+                                                   word_flag=True, text=p_entity.entity_text)
+                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
+                                        self.pattern_money_other, _span[0]) is None:
+                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
+                                    p_entity.label = 0
+                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
+                                    if re.search(self.pattern_money_other, _span[0]) is not None:
+                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
+                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
+                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                            p_entity.label = 1
+                                    else:
+                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                        p_entity.label = 1
+                                if re.search(self.pattern_money_tenderer_whole,
+                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
+                                                                                       _span[0]) is None:
+                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
+                                    p_entity.label = 1
+
+            # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
+            list_p = []
+            state = 0
+            for p_entity in list_entity:
+                for _sentence in list_sentence:
+                    if _sentence.sentence_index == p_entity.sentence_index:
+                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
+                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
+                                           text=p_entity.entity_text)
+
+                        if state == 2:
+                            for _p in list_p[1:]:
+                                _p.values[0] = 0.8 + _p.values[0] / 10
+                                _p.label = 0
+                            state = 0
+                            list_p = []
+
+                        if state == 0:
+                            if p_entity.entity_type in ["money"]:
+                                if str(p_entity.label) == "0" and re.search(self.pattern_pack,
+                                                                            _span[0] + "-" + _span[2]) is not None:
+                                    state = 1
+                                    list_p.append(p_entity)
+                        elif state == 1:
+                            if p_entity.entity_type in ["money"]:
+                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
+                                                                                   _span[0] + "-" + _span[
+                                                                                       2]) is not None and re.search(
+                                        self.pattern_money_other,
+                                        _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
+                                    0].sentence_index:
+                                    list_p.append(p_entity)
+                                else:
+                                    state = 2
+
+            if len(list_p) > 1:
+                for _p in list_p[1:]:
+                    # print("==",_p.entity_text,_p.sentence_index,_p.label)
+                    _p.values[0] = 0.8 + _p.values[0] / 10
+                    _p.label = 0
+                state = 0
+                list_p = []
+
+            for p_entity in list_entity:
+                # 将属于集合中的不可能是中标人的标签置为无
+                if p_entity.entity_text in self.SET_NOT_TENDERER:
+                    p_entity.label = 5
+
+'''Regex fallback: tag the org in the closing "<name>, <date>" line as tenderee or agency. 2021/12/30'''
+class RoleRuleFinalAdd():
+    """Regex fallback (added 2021/12/30): when the models tagged no tenderee
+    (label 0) / agency (label 1), recover one from the article tail
+    ("<org name>, <date>" signature line), the bank account-holder line, the
+    contact/delivery-address line, the publisher line, or — failing all of
+    those — from a company name contained in the announcement title."""
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
+        # Build the tail text from the last tokens of the last 5 non-attachment sentences.
+        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
+        end_tokens = []
+        for sentence in main_sentences[-5:]:
+            end_tokens.extend(sentence.tokens)
+        text_end = "".join(end_tokens[-30:])
+        # print(text_end)
+        # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
+        # sear_ent: "<org name>, <date>" signature at the very end of the main text.
+        sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        # sear_ent2: account-holder name; sear_ent3: registration/delivery address; sear_ent4: publisher.
+        sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        # Tried in this priority order below (publisher first, tail-signature last).
+        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
+
+        tenderee_notfound = True
+        agency_notfound = True
+        ents = []  # untagged org/company entities (label 5) as recall candidates
+        for ent in list_entitys[0]:
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    tenderee_notfound = False
+                elif ent.label == 1:
+                    agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
+            for _sear_ent in [_sear for _sear in sear_list if _sear]:
+                # if sear_ent4:
+                #     ent_re = sear_ent4.group(2)
+                # elif sear_ent3:
+                #     ent_re = sear_ent3.group(2)
+                # elif sear_ent2:
+                #     ent_re = sear_ent2.group(2)
+                # else:
+                #     ent_re = sear_ent.group(1)
+                # Group index differs: the tail-signature pattern captures the name in group 1.
+                if _sear_ent==sear_ent4:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent3:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent2:
+                    ent_re = _sear_ent.group(2)
+                else:
+                    ent_re = _sear_ent.group(1)
+                # print('ent_re', ent_re)
+                # Normalize punctuation so the name compares equal to entity texts.
+                ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
+
+                # Tenderee candidates: public institutions, or names with no agency keyword.
+                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
+                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
+                    n = 0
+                    # Walk candidates from the end of the document backwards.
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent: # for the article-tail "name + date" pattern, only check the last 3 entities
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 0
+                            ents[i].values[0] = 0.5
+                            tenderee_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                # Agency candidates: names containing an agency-business keyword.
+                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
+                    n = 0
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent:  # for the article-tail "name + date" pattern, only check the last 3 entities
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 1
+                            ents[i].values[1] = 0.5
+                            agency_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                if not tenderee_notfound:
+                    break
+
+        elif list_codenames[0]['name'] != "":  # use a company entity contained in the title as the tenderee
+            # tenderee_notfound = True
+            # ents = []
+            # for ent in list_entitys[0]:
+            #     if ent.entity_type in ['org', 'company']:
+            #         if ent.label == 0:
+            #             tenderee_notfound = False
+            #         elif ent.label == 1:
+            #             agency_notfound = False
+            #         elif ent.label == 5:
+            #             ents.append(ent)
+            if tenderee_notfound == True:
+                # print('list_codenames',list_codenames[0]['name'])
+                for ent in ents:
+                    if ent.entity_text in list_codenames[0]['name']:
+                        ent.label = 0
+                        ent.values[0] = 0.5
+                        # log('正则召回标题中包含的实体:%s'%ent.entity_text)
+                        break
+
+# Rules for recalling the tenderee role
+class TendereeRuleRecall():
+    """Rule-based recall of the tenderee role when no entity was tagged as
+    tenderee (label 0) by the models. Tries, in order: context-window regexes
+    around untagged org/company entities, first-person subject inference
+    ("our hospital/school/bureau"), and finally regex extraction of entity
+    strings the NER model missed (appended as new Entity objects)."""
+    def __init__(self):
+        # Left-context cues that mark the following entity as the tenderee.
+        self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
+                                        "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
+
+        # Right-context cues ("(hereinafter 'we/our ...')", "... is procuring ...", etc.).
+        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
+                                        "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
+                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
+                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
+        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
+        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
+        # Pattern for the announcement's first-person subject ("our hospital/school/bureau").
+        self.subject = re.compile("[我本][院校局]")
+        # Regexes that recall tenderee names the NER model missed.
+        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
+                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        # Suffix checks that validate a recalled entity string looks like an organization.
+        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
+        self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
+
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        """Apply the recall strategies in order, stopping once a tenderee is found."""
+        # tenderee_notfound = True
+        # agency_notfound = True
+        self.get_tenderee = False
+        ents = []       # untagged org/company entities (label 5)
+        list_name = []  # project-name entities, used by the context rule
+        for ent in list_entitys[0]:
+            if ent.entity_type == 'name':
+                list_name.append(ent.entity_text)
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    # tenderee_notfound = False
+                    self.get_tenderee = True
+                # elif ent.label == 1:
+                #     agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if not self.get_tenderee:
+            self.entity_context_rule(ents,list_name,list_sentences)
+        if not self.get_tenderee:
+            self.subject_rule(ents,list_articles,list_sentences)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
+
+    # Regex check on each candidate entity's left/right context window.
+    def entity_context_rule(self,entitys,list_name,list_sentences):
+        """Promote an untagged entity to tenderee when its context matches a cue."""
+        for ent in entitys:
+            _sentence = list_sentences[0][ent.sentence_index]
+            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                               end_index=ent.end_index, size=40, center_include=True,
+                               word_flag=True, use_text=True,
+                               text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+            if re.search(self.tenderee_left,_span[0]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right,_span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right2, _span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif list_name:
+                # "<entity> regarding <project>" — accept if <project> contains a known project name.
+                pj_name = re.search(self.tenderee_right3, _span[2])
+                if pj_name:
+                    pj_name = pj_name.groupdict()["project"]
+                    for _name in list_name:
+                        if _name in pj_name:
+                            ent.label = 0
+                            ent.values[0] = 0.5
+                            self.get_tenderee = True
+                            break
+    # Infer the tenderee from the announcement's first-person subject.
+    def subject_rule(self, entitys,list_articles,list_sentences):
+        """Match "our hospital/school/bureau" phrasing against candidate entity names."""
+        content = list_articles[0].content.split('##attachment##')[0]
+        if re.search(self.subject,content):
+            _subject = re.search(self.subject,content).group()
+            for ent in entitys:
+                if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("局", _subject) and re.search("局", ent.entity_text):
+                    _sentence = list_sentences[0][ent.sentence_index]
+                    _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                                       end_index=ent.end_index, size=20, center_include=True,
+                                       word_flag=True, use_text=True,
+                                       text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+                    # Exclude supervisory/complaint bureaus mentioned only as contacts.
+                    if not re.search("监督|投诉",_span[0][-10:]):
+                        ent.label = 0
+                        ent.values[0] = 0.5 + ent.values[0] / 10
+                        self.get_tenderee = True
+
+    # Recall unrecognized entities by regex and append them as new Entity objects.
+    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
+        """Scan sentences (main text first, then attachments) with *pattern* and
+        create new 'company' entities labeled as tenderee with confidence *on_value*."""
+        list_sentence = list_sentences[0]
+        for in_attachment in [False,True]:
+            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
+                sentence_text = sentence.sentence_text
+                tokens = sentence.tokens
+                doc_id = sentence.doc_id
+                in_attachment = sentence.in_attachment
+                # Character offset of each token's start, used to map char spans to token indices.
+                list_tokenbegin = []
+                begin = 0
+                for i in range(0, len(tokens)):
+                    list_tokenbegin.append(begin)
+                    begin += len(str(tokens[i]))
+                list_tokenbegin.append(begin + 1)
+                for _match in re.finditer(pattern,sentence_text):
+                    _groupdict = _match.groupdict()
+                    _match_text = _match.group()
+                    _unrecognized_text = _groupdict["unrecognized"]
+                    # print(_unrecognized_text)
+                    # if _match_text[-1] in [':',':']:
+                    #     _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    #     if not _unrecognized:
+                    #         _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    #     if _unrecognized:
+                    #         _unrecognized = _unrecognized.group()
+                    #     else:
+                    #         continue
+                    # else:
+                    #     _unrecognized = _unrecognized_text
+                    # Keep only strings ending like an organization name.
+                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    if not _unrecognized:
+                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    if _unrecognized:
+                        _unrecognized = _unrecognized.group()
+                    else:
+                        continue
+                    # print(_unrecognized)
+                    # Skip anonymized names ("某" = "a certain ...").
+                    if re.search("某",_unrecognized):
+                        continue
+                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
+                    # NOTE(review): begin_index/end_index stay unbound if no boundary
+                    # matches — presumably offsets always fall inside the token map; confirm.
+                    for j in range(len(list_tokenbegin)):
+                        if list_tokenbegin[j] == begin_index_temp:
+                            begin_index = j
+                            break
+                        elif list_tokenbegin[j] > begin_index_temp:
+                            begin_index = j - 1
+                            break
+                    index = begin_index_temp + len(_unrecognized)
+                    end_index_temp = index
+                    for j in range(begin_index, len(list_tokenbegin)):
+                        if list_tokenbegin[j] >= index:
+                            end_index = j - 1
+                            break
+                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
+                    entity_text = _unrecognized
+                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
+                               begin_index_temp, end_index_temp, in_attachment=in_attachment)
+                    new_entity.label = 0
+                    new_entity.values = [on_value,0,0,0,0,0]
+                    list_entitys[0].append(new_entity)
+                    self.get_tenderee = True
+            if self.get_tenderee:
+                # Restore document order after appending new entities.
+                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
+                break
+
+# Time-entity category classifier
+class TimePredictor():
+    """Classifies 'time' entities into time categories with a TF SavedModel
+    that scores the left/right word-embedding context of each entity."""
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.inputs_code = None
+        self.outputs_code = None
+        # (left/right contexts, context length in tokens, embedding dim)
+        self.input_shape = (2,40,128)
+        self.load_model()
+
+    def load_model(self):
+        """Load the SavedModel once and cache its input/output tensors."""
+        model_path = os.path.dirname(__file__)+'/timesplit_model'
+        if self.inputs_code is None:
+            log("get model of time")
+            with self.sess.as_default():
+                with self.sess.graph.as_default():
+                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
+                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+                    signature_def = meta_graph_def.signature_def
+                    self.inputs_code = []
+                    self.inputs_code.append(
+                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
+                    self.inputs_code.append(
+                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
+                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
+                    return self.inputs_code, self.outputs_code
+        else:
+            return self.inputs_code, self.outputs_code
+
+    def search_time_data(self,list_sentences,list_entitys):
+        """Collect context embeddings for every 'time' entity.
+
+        Returns [data_x, points_entitys] where data_x has the left/right
+        contexts as the leading axis, or None when there is no time entity."""
+        data_x = []
+        points_entitys = []
+        for list_sentence, list_entity in zip(list_sentences, list_entitys):
+            p_entitys = 0
+            p_sentences = 0
+            list_sentence.sort(key=lambda x: x.sentence_index)
+            # Two-pointer walk: entities and sentences are both ordered by sentence_index.
+            while(p_entitys<len(list_entity)):
+                entity = list_entity[p_entitys]
+                if entity.entity_type in ['time']:
+                    while(p_sentences<len(list_sentence)):
+                        sentence = list_sentence[p_sentences]
+                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
+                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
+                            left = s[0]
+                            right = s[1]
+                            context = [left, right]
+                            x = self.embedding_words(context, shape=self.input_shape)
+                            data_x.append(x)
+                            points_entitys.append(entity)
+                            break
+                        p_sentences += 1
+                p_entitys += 1
+        if len(points_entitys)==0:
+            return None
+        # Move the left/right-context axis to the front: (2, batch, len, dim).
+        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
+        return [data_x, points_entitys]
+
+    def embedding_words(self, datas, shape):
+        '''
+        @summary: look up the word vector for each token
+        @param:
+            datas: list of token sequences (left context, right context)
+            shape: shape of the result array
+        @return: array, word embeddings padded/truncated to the given shape
+        '''
+        model_w2v = getModel_w2v()
+        embed = np.zeros(shape)
+        length = shape[1]
+        out_index = 0
+        for data in datas:
+            index = 0
+            for item in data:
+                item_not_space = re.sub("\s*", "", item)
+                if index >= length:
+                    break
+                if item_not_space in model_w2v.vocab:
+                    embed[out_index][index] = model_w2v[item_not_space]
+                    index += 1
+                else:
+                    # Out-of-vocabulary tokens fall back to the 'unk' vector.
+                    embed[out_index][index] = model_w2v['unk']
+                    index += 1
+            out_index += 1
+        return embed
+
+    def predict(self, list_sentences,list_entitys):
+        """Run the model and write (label, values) onto each time entity in place."""
+        datas = self.search_time_data(list_sentences, list_entitys)
+        if datas is None:
+            return
+        points_entitys = datas[1]
+        with self.sess.as_default():
+            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
+                ,self.inputs_code[1]:datas[0][1]})[0]
+            for i in range(len(predict_y)):
+                entity = points_entitys[i]
+                label = np.argmax(predict_y[i])
+                values = []
+                for item in predict_y[i]:
+                    values.append(item)
+                # Demote to "no category" when the text is not a valid time format.
+                if label != 0:
+                    if not timeFormat(entity.entity_text):
+                        label = 0
+                        values[0] = 0.5
+                entity.set_Role(label, values)
+
+# Product field extraction
+class ProductPredictor():
+    """Char-level sequence labeler with CRF transition decoding that extracts
+    product mentions (tag runs "12*3") and bid-failure reasons (tag runs
+    "45*6") from announcement text; can also delegate inference to an HTTP
+    API when USE_API is set."""
+    def __init__(self):
+        vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
+        self.vocab = load(vocabpath)
+        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
+        self.sess = tf.Session(graph=tf.Graph())
+        self.load_model()
+
+    def load_model(self):
+        """Load the frozen graph and cache the input/output tensors by name."""
+        # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
+        model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(model_path, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name='')
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
+                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
+                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
+                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
+                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+
+    def decode(self,logits, lengths, matrix):
+        """Viterbi-decode each scored sequence with transition *matrix*; a
+        synthetic start state and a padding column keep the matrix square."""
+        paths = []
+        small = -1000.0
+        # start = np.asarray([[small] * 4 + [0]])
+        start = np.asarray([[small]*7+[0]])
+        for score, length in zip(logits, lengths):
+            score = score[:length]
+            pad = small * np.ones([length, 1])
+            logits = np.concatenate([score, pad], axis=1)
+            logits = np.concatenate([start, logits], axis=0)
+            path, _ = viterbi_decode(logits, matrix)
+            # Drop the synthetic start state.
+            paths.append(path[1:])
+        return paths
+
+    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
+        '''
+        Predict product entities; each sentence contributes at most MAX_AREA
+        characters (longer sentences are truncated).
+        :param list_sentences: sentence lists per article, [[sentences of article 1], ...]
+        :param list_entitys: entity lists per article; product entities are appended in place
+        :param list_articles: with fail=True, the raw articles used for failure-reason extraction
+        :param fail: when True, run failure-reason extraction over the raw article text instead
+        :param MAX_AREA: maximum number of characters taken from each sentence
+        :return: dict with 'fail_reason' (entities are written into list_entitys)
+        '''
+        with self.sess.as_default() as sess:
+            with self.sess.graph.as_default():
+                result = []
+                # Failure-reason branch: label the raw article text directly.
+                if fail and list_articles!=[]:
+                    text_list = [list_articles[0].content[:MAX_AREA]]
+                    chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
+                    if USE_API:
+                        requests_result = requests.post(API_URL + "/predict_product",
+                                               json={"inputs": chars}, verify=True)
+                        batch_paths = json.loads(requests_result.text)['result']
+                        lengths = json.loads(requests_result.text)['lengths']
+                    else:
+                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                          feed_dict={
+                                                              self.char_input: np.asarray(chars),
+                                                              self.dropout: 1.0
+                                                          })
+                        batch_paths = self.decode(scores, lengths, tran_)
+                    for text, path, length in zip(text_list, batch_paths, lengths):
+                        tags = ''.join([str(it) for it in path[:length]])
+                        # Tags 1-2-3 delimit a product span.
+                        for it in re.finditer("12*3", tags):
+                            start = it.start()
+                            end = it.end()
+                            _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
+                                list_articles[0].doc_id, 0, start, end),
+                                             entity_text=text[start:end],
+                                             entity_type="product", sentence_index=0,
+                                             begin_index=0, end_index=0, wordOffset_begin=start,
+                                             wordOffset_end=end)
+                            list_entitys[0].append(_entity)
+                        # Tags 4-5-6 delimit a failure-reason span.
+                        for it in re.finditer("45*6", tags):
+                            start = it.start()
+                            end = it.end()
+                            result.append(text[start:end].replace('?', '').strip())
+                    # Deduplicate reasons; a checked-box "(√)" reason wins outright.
+                    reasons = []
+                    for it in result:
+                        if "(√)" in it or "(√)" in it:
+                            reasons = [it]
+                            break
+                        if reasons != [] and (it not in reasons[-1] and it not in reasons):
+                            reasons.append(it)
+                        elif reasons == []:
+                            reasons.append(it)
+                    return {'fail_reason':';'.join(reasons)}
+
+                if list_entitys is None:
+                    list_entitys = [[] for _ in range(len(list_sentences))]
+                for list_sentence, list_entity in zip(list_sentences,list_entitys):
+                    if len(list_sentence)==0:
+                        result.append({"product":[]})
+                        continue
+                    # Sort longest-first so each batch is sized by its longest sentence.
+                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    item = {"product":[]}
+                    temp_list = []
+                    while True:
+                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        # Batch size chosen so batch characters stay within MAX_AREA.
+                        _LEN = MAX_AREA//MAX_LEN
+                        chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+                        chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
+                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
+                        if USE_API:
+                            requests_result = requests.post(API_URL + "/predict_product",
+                                                   json={"inputs": chars.tolist()}, verify=True)
+                            batch_paths = json.loads(requests_result.text)['result']
+                            lengths = json.loads(requests_result.text)['lengths']
+                        else:
+                            lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                              feed_dict={
+                                                                        self.char_input: np.asarray(chars),
+                                                                        self.dropout: 1.0
+                                                                        })
+                            batch_paths = self.decode(scores, lengths, tran_)
+                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
+                            tags = ''.join([str(it) for it in path[:length]])
+                            for it in re.finditer("12*3", tags):
+                                start = it.start()
+                                end = it.end()
+                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                                sentence.doc_id, sentence.sentence_index, start, end),
+                                                 entity_text=sentence.sentence_text[start:end],
+                                                 entity_type="product", sentence_index=sentence.sentence_index,
+                                                 begin_index=0, end_index=0, wordOffset_begin=start,
+                                                 wordOffset_end=end,in_attachment=sentence.in_attachment)
+                                list_entity.append(_entity)
+                                temp_list.append(sentence.sentence_text[start:end])
+                        # item["product"] = list(set(temp_list))
+                        # result.append(item)
+                        if _begin_index+_LEN >= len(list_sentence):
+                            break
+                        _begin_index += _LEN
+                    # Bug fix: aggregate products once per document, after all batches.
+                    item["product"] = list(set(temp_list))
+                    result.append(item)
+                return {'fail_reason': ""}
+
+
+# Extract product quantity/unit-price/brand/spec  # 2021/11/10 added extraction of project, demand, budget and time fields from tables
+class ProductAttributesPredictor():
+    def __init__(self,):
+        # p1: table-header pattern "(item/goods/project/...) name/content/description".
+        self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
+        # p2: bare item/goods/project keywords used for looser header matching.
+        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
+        # Pre-built set of known table-header strings shipped next to this module.
+        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
+            self.header_set = pickle.load(f)
+    def isTrueTable(self, table):
+        '''Heuristics deciding whether a <table> is a genuine data table:
+        1. contains <caption> or <th> tags -> genuine
+        2. contains many links, forms or images -> layout table, not genuine
+        3. too few rows -> not genuine
+        4. outer <table> nesting an inner <table>: the inner one is usually
+           genuine, the outer one is layout'''
+        if table.find_all(['caption', 'th']) != []:
+            return True
+        elif len(table.find_all(['form', 'a', 'img'])) > 5:
+            return False
+        elif len(table.find_all(['tr'])) < 2:
+            return False
+        elif len(table.find_all(['table'])) >= 1:
+            return False
+        else:
+            return True
+
+    def getTrs(self, tbody):
+        # Collect all <tr> rows: direct children of the element, plus rows
+        # nested one level inside a direct <tbody> child.
+        trs = []
+        objs = tbody.find_all(recursive=False)
+        for obj in objs:
+            if obj.name == "tr":
+                trs.append(obj)
+            if obj.name == "tbody":
+                for tr in obj.find_all("tr", recursive=False):
+                    trs.append(tr)
+        return trs
+
+    def getTable(self, tbody):
+        '''Convert a <table> element into a list of rows, each a list of
+        whitespace-stripped cell texts. Rows with fewer than 2 cells are
+        dropped; an empty list is returned for tables with fewer than 2 rows.'''
+        trs = self.getTrs(tbody)
+        inner_table = []
+        if len(trs) < 2:
+            return inner_table
+        for tr in trs:
+            tr_line = []
+            tds = tr.findChildren(['td', 'th'], recursive=False)
+            if len(tds) < 2:
+                continue
+            for td in tds:
+                td_text = re.sub('\s', '', td.get_text())
+                tr_line.append(td_text)
+            inner_table.append(tr_line)
+        return inner_table
+
+    def fixSpan(self, tbody):
+        # Expand colspan/rowspan by duplicating cells in place so that the
+        # table becomes rectangular before text extraction.
+        trs = self.getTrs(tbody)
+        ths_len = 0
+        ths = list()
+        trs_set = set()
+        # Column (colspan) completion is done before row (rowspan) completion;
+        # doing it the other way round can scramble the parsed table.
+        # First pass: iterate every tr and expand colspans.
+
+        for indtr, tr in enumerate(trs):
+            ths_tmp = tr.findChildren('th', recursive=False)
+            # skip rows that contain a nested table
+            if len(tr.findChildren('table')) > 0:
+                continue
+            if len(ths_tmp) > 0:
+                ths_len = ths_len + len(ths_tmp)
+                for th in ths_tmp:
+                    ths.append(th)
+                trs_set.add(tr)
+            # iterate the cells of this row
+            tds = tr.findChildren(recursive=False)
+            if len(tds) < 3:
+                continue  # too few columns: do not expand
+            for indtd, td in enumerate(tds):
+                # colspan present: duplicate the cell into the following
+                # positions of the same row
+                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
+                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
+                    # guard against absurd spans / huge cells blowing up the DOM
+                    if col < 10 and len(td.get_text()) < 500:
+                        td['colspan'] = 1
+                        for i in range(1, col, 1):
+                            td.insert_after(copy.copy(td))
+        # Second pass: expand rowspans.
+        for indtr, tr in enumerate(trs):
+            ths_tmp = tr.findChildren('th', recursive=False)
+            # skip rows that contain a nested table
+            if len(tr.findChildren('table')) > 0:
+                continue
+            if len(ths_tmp) > 0:
+                ths_len = ths_len + len(ths_tmp)
+                for th in ths_tmp:
+                    ths.append(th)
+                trs_set.add(tr)
+            # iterate the cells of this row
+            tds = tr.findChildren(recursive=False)
+            same_span = 0
+            # if every cell of the row carries the same rowspan, treat it as a
+            # styling artefact and leave the row untouched
+            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
+                span0 = tds[0].attrs['rowspan']
+                for td in tds:
+                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
+                        same_span += 1
+            if same_span == len(tds):
+                continue
+
+            for indtd, td in enumerate(tds):
+                # rowspan present: copy the cell into the same position of the
+                # following rows
+                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
+                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
+                    td['rowspan'] = 1
+                    for i in range(1, row, 1):
+                        # fetch all tds of the next row and insert at the
+                        # corresponding position
+                        if indtr + i < len(trs):
+                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
+                            if len(tds1) >= (indtd) and len(tds1) > 0:
+                                if indtd > 0:
+                                    tds1[indtd - 1].insert_after(copy.copy(td))
+                                else:
+                                    tds1[0].insert_before(copy.copy(td))
+                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
+                                tds1[indtd - 2].insert_after(copy.copy(td))
+
+    def get_monthlen(self, year, month):
+        '''Given a year and month (int-convertible), return the number of days
+        in that month as a string; falls back to "30" on invalid input.'''
+        try:
+            weekday, num = calendar.monthrange(int(year), int(month))
+        except:
+            num = 30
+        return str(num)
+    def fix_time(self, text, html, page_time):
+        '''Normalize a free-form date text into a ("YYYY-MM-DD", "YYYY-MM-DD")
+        begin/end pair; returns ("", "") when no pattern matches. The year for
+        month-only texts is taken from the html, then page_time, then "now".'''
+        # map Chinese numerals to digits before matching (longest first)
+        for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
+                   ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
+            if it[0] in text:
+                text = text.replace(it[0], it[1])
+        # pattern "N月" (month only): span the whole month of an inferred year
+        if re.search('^\d{1,2}月$', text):
+            m = re.search('^(\d{1,2})月$', text).group(1)
+            if len(m) < 2:
+                m = '0' + m
+            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
+            if year:
+                y = year.group(1)
+                num = self.get_monthlen(y, m)
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (y, m)
+                order_end = "%s-%s-%s" % (y, m, num)
+            elif page_time != "":
+                year = re.search('\d{4}', page_time)
+                if year:
+                    y = year.group(0)
+                    num = self.get_monthlen(y, m)
+                    if len(num) < 2:
+                        num = '0' + num
+                    order_begin = "%s-%s-01" % (y, m)
+                    order_end = "%s-%s-%s" % (y, m, num)
+                else:
+                    y = str(datetime.datetime.now().year)
+                    num = self.get_monthlen(y, m)
+                    if len(num) < 2:
+                        num = '0' + num
+                    order_begin = "%s-%s-01" % (y, m)
+                    order_end = "%s-%s-%s" % (y, m, num)
+            else:
+                y = str(datetime.datetime.now().year)
+                num = self.get_monthlen(y, m)
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (y, m)
+                order_end = "%s-%s-%s" % (y, m, num)
+            return order_begin, order_end
+
+        # pattern "YYYY年M(月)" / "YYYY-M" etc.: span the whole month
+        t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
+        if t1:
+            year = t1.group(1)
+            month = t1.group(3)
+            num = self.get_monthlen(year, month)
+            if len(month)<2:
+                month = '0'+month
+            if len(num) < 2:
+                num = '0'+num
+            order_begin = "%s-%s-01" % (year, month)
+            order_end = "%s-%s-%s" % (year, month, num)
+            return order_begin, order_end
+        # pattern "YYYY年M月D(日)": a single day
+        t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
+        if t2:
+            y = t2.group(1)
+            m = t2.group(3)
+            d = t2.group(5)
+            m = '0'+ m if len(m)<2 else m
+            d = '0'+d if len(d)<2 else d
+            order_begin = order_end = "%s-%s-%s"%(y,m,d)
+            return order_begin, order_end
+        # compact pattern like "202105"
+        t3 = re.search("^(20\d{2})(\d{1,2})$",text)
+        if t3:
+            year = t3.group(1)
+            month = t3.group(2)
+            if int(month)>0 and int(month)<=12:
+                num = self.get_monthlen(year, month)
+                if len(month) < 2:
+                    month = '0' + month
+                if len(num) < 2:
+                    num = '0' + num
+                order_begin = "%s-%s-01" % (year, month)
+                order_end = "%s-%s-%s" % (year, month, num)
+                return order_begin, order_end
+        # compact pattern like "20210510"
+        t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
+        if t4:
+            year = t4.group(1)
+            month = t4.group(2)
+            day = t4.group(3)
+            if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
+                order_begin = order_end = "%s-%s-%s"%(year,month,day)
+                return order_begin, order_end
+        # range pattern "YYYY年M月(D日)?到/至/-(YYYY年)?M月(D日)?"
+        all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
+                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
+                                '(?:(?P<d2>\d{1,2})日)?)?$', text)
+        y1 = m1 = d1 = y2 = m2 = d2 = ""
+        found_math = False
+        for _match in all_match:
+            if len(_match.group()) > 0:
+                found_math = True
+                for k, v in _match.groupdict().items():
+                    if v!="" and v is not None:
+                        if k == 'y1':
+                            y1 = v
+                        elif k == 'm1':
+                            m1 = v
+                        elif k == 'd1':
+                            d1 = v
+                        elif k == 'y2':
+                            y2 = v
+                        elif k == 'm2':
+                            m2 = v
+                        elif k == 'd2':
+                            d2 = v
+        if not found_math:
+            return "", ""
+        # fill missing range parts: same year, first day, last day of month
+        y2 = y1 if y2 == "" else y2
+        d1 = '1' if d1 == "" else d1
+        d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
+        m1 = '0' + m1 if len(m1) < 2 else m1
+        m2 = '0' + m2 if len(m2) < 2 else m2
+        d1 = '0' + d1 if len(d1) < 2 else d1
+        d2 = '0' + d2 if len(d2) < 2 else d2
+        order_begin = "%s-%s-%s"%(y1,m1,d1)
+        order_end = "%s-%s-%s"%(y2,m2,d2)
+        return order_begin, order_end
+
+    def find_header(self, items, p1, p2):
+        '''
+        Regex-check whether a table row is a header row; if so, locate the
+        columns of interest.
+        :param items: list of cell texts for one row
+        :param p1: first-priority header regex (product-name column)
+        :param p2: fallback header regex
+        :return: (dict mapping field name -> column index, is-header flag,
+                  (product, quantity, unitPrice, brand, specs) header texts,
+                  (product, demand, budget, order_time) header texts)
+        '''
+        flag = False
+        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
+        product = ""  # product name
+        quantity = ""  # quantity
+        unitPrice = ""  # unit price
+        brand = ""  # brand
+        specs = ""  # specifications
+        demand = "" # procurement demand
+        budget = "" # budget amount
+        order_time = "" # procurement time
+
+        # look for the product-name column within the first 4 cells
+        for i in range(min(4, len(items))):
+            it = items[i]
+            if len(it) < 15 and re.search(p1, it) != None:
+                flag = True
+                product = it
+                header_dic['名称'] = i
+                break
+        if not flag:
+            # fallback: bare keyword, excluding obviously unrelated headers
+            for i in range(min(4, len(items))):
+                it = items[i]
+                if len(it) < 15 and re.search(p2, it) and re.search(
+                        '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
+                    flag = True
+                    product = it
+                    header_dic['名称'] = i
+                    break
+        if flag:
+            # scan remaining cells for the attribute columns
+            for j in range(i + 1, len(items)):
+                # skip long prose cells (unlikely to be headers)
+                if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
+                    continue
+                if header_dic['数量']=="" and re.search('数量', items[j]):
+                    header_dic['数量'] = j
+                    quantity = items[j]
+                elif re.search('单价', items[j]):
+                    header_dic['单价'] = j
+                    unitPrice = items[j]
+                elif re.search('品牌', items[j]):
+                    header_dic['品牌'] = j
+                    brand = items[j]
+                elif re.search('规格', items[j]):
+                    header_dic['规格'] = j
+                    specs = items[j]
+
+                elif re.search('需求', items[j]):
+                    header_dic['需求'] = j
+                    demand = items[j]
+                elif re.search('预算', items[j]):
+                    header_dic['预算'] = j
+                    budget = items[j]
+                elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
+                    header_dic['时间'] = j
+                    order_time = items[j]
+
+            # accept as header only when the name column plus at least one
+            # other recognized column were found (>= 2 hits total)
+            if header_dic.get('名称', "") != "" :
+                num = 0
+                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
+                    if it != "":
+                        num  += 1
+                if num >=2:
+                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
+        flag = False
+        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
+
+    def predict(self, docid='', html='', page_time=""):
+        '''
+        正则寻找table表格内 产品相关信息
+        :param html:公告HTML原文
+        :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息
+        '''
+
+
+        soup = BeautifulSoup(html, 'lxml')
+        flag_yx = True if re.search('采购意向', html) else False
+        tables = soup.find_all(['table'])
+        headers = []
+        headers_demand = []
+        header_col = []
+        product_link = []
+        demand_link = []
+        total_product_money = 0
+        for i in range(len(tables)-1, -1, -1):
+            table = tables[i]
+            if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
+                table.string = table.get_text()
+                table.name = 'turntable'
+                continue
+            if not self.isTrueTable(table):
+                continue
+            self.fixSpan(table)
+            inner_table = self.getTable(table)
+            i = 0
+            found_header = False
+            header_colnum = 0
+            if flag_yx:
+                col0_l = []
+                col1_l = []
+                for tds in inner_table:
+                    if len(tds) == 2:
+                        col0_l.append(re.sub(':', '', tds[0]))
+                        col1_l.append(tds[1])
+                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
+                    header_list2 = []
+                    product = demand = budget = order_begin = order_end = ""
+                    for i in range(len(col0_l)):
+                        if re.search('项目名称', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            product = col1_l[i]
+                        elif re.search('采购需求|需求概况', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            demand = col1_l[i]
+                        elif re.search('采购预算|预算金额', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            budget = col1_l[i]
+                            re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                            if re_price:
+                                budget = re_price[0]
+                                if '万元' in col0_l[i] and '万' not in budget:
+                                    budget += '万元'
+                                budget = str(getUnifyMoney(budget))
+                            else:
+                                budget = ""
+                        elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
+                            header_list2.append(col0_l[i])
+                            order_time = col1_l[i].strip()
+                            order_begin, order_end = self.fix_time(order_time, html, page_time)
+                    if order_begin != "" and order_end!="":
+                        order_begin_year = int(order_begin.split("-")[0])
+                        order_end_year = int(order_end.split("-")[0])
+                        # 限制附件错误识别时间
+                        if order_begin_year>=2050 or order_end_year>=2050:
+                            order_begin = order_end = ""
+                    if product!= "" and demand != "" and budget!="" and order_begin != "":
+                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
+                                'order_begin': order_begin, 'order_end': order_end}
+                        if link not in demand_link:
+                            demand_link.append(link)
+                            headers_demand.append('_'.join(header_list2))
+                        continue
+            while i < (len(inner_table)):
+                tds = inner_table[i]
+                not_empty = [it for it in tds if it != ""]
+                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
+                    i += 1
+                    continue
+                product = ""  # 产品
+                quantity = ""  # 数量
+                unitPrice = ""  # 单价
+                brand = ""  # 品牌
+                specs = ""  # 规格
+                demand = ""  # 采购需求
+                budget = ""  # 预算金额
+                order_time = ""  # 采购时间
+                order_begin = ""
+                order_end = ""
+                if len(set(tds) & self.header_set) > len(tds) * 0.2:
+                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
+                    if found_header:
+                        headers.append('_'.join(header_list))
+                        headers_demand.append('_'.join(header_list2))
+                        header_colnum = len(tds)
+                        header_col.append('_'.join(tds))
+                    i += 1
+                    continue
+                elif found_header:
+                    if len(tds) != header_colnum:  # 表头、属性列数不一致跳过
+                        i += 1
+                        continue
+                    id1 = header_dic.get('名称', "")
+                    id2 = header_dic.get('数量', "")
+                    id3 = header_dic.get('单价', "")
+                    id4 = header_dic.get('品牌', "")
+                    id5 = header_dic.get('规格', "")
+
+                    id6 = header_dic.get('需求', "")
+                    id7 = header_dic.get('预算', "")
+                    id8 = header_dic.get('时间', "")
+                    if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
+                            re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
+                        product = tds[id1]
+                        if id2 != "":
+                            if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
+                                quantity = tds[id2]
+                            else:
+                                quantity = ""
+                        if id3 != "":
+                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
+                                unitPrice = tds[id3]
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice)
+                                if re_price:
+                                    unitPrice = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in unitPrice:
+                                        unitPrice += '万元'
+                                    # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
+                                    unitPrice = str(getUnifyMoney(unitPrice))
+                                else:
+                                    unitPrice = ""
+                            else:
+                                unitPrice = ""
+                        if id4 != "":
+                            if re.search('\w', tds[id4]):
+                                brand = tds[id4]
+                            else:
+                                brand = ""
+                        if id5 != "":
+                            if re.search('\w', tds[id5]):
+                                specs = tds[id5]
+                            else:
+                                specs = ""
+                        if id6 != "":
+                            if re.search('\w', tds[id6]):
+                                demand = tds[id6]
+                            else:
+                                demand = ""
+                        if id7 != "":
+                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
+                                budget = tds[id7]
+                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
+                                if re_price:
+                                    budget = re_price[0]
+                                    if '万元' in header_list[2] and '万' not in budget:
+                                        budget += '万元'
+                                    budget = str(getUnifyMoney(budget))
+                                else:
+                                    budget = ""
+                            else:
+                                budget = ""
+                        if id8 != "":
+                            if re.search('\w', tds[id8]):
+                                order_time = tds[id8].strip()
+                                order_begin, order_end = self.fix_time(order_time, html, page_time)
+                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
+                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
+                                                      'brand': brand[:50], 'specs':specs}
+                            if link not in product_link:
+                                product_link.append(link)
+                                mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
+                                if link['unitPrice'] != "" and mat:
+                                    try:
+                                        total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
+                                    except:
+                                        log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
+                        if order_begin != "" and order_end != "":
+                            order_begin_year = int(order_begin.split("-")[0])
+                            order_end_year = int(order_end.split("-")[0])
+                            # 限制附件错误识别时间
+                            if order_begin_year >= 2050 or order_end_year >= 2050:
+                                order_begin = order_end = ""
+                        if budget != "" and order_time != "":
+                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
+                            if link not in demand_link:
+                                demand_link.append(link)
+                    i += 1
+                else:
+                    i += 1
+        if len(product_link)>0:
+            attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
+        else:
+            attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
+        if len(demand_link)>0:
+            demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
+        else:
+            demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
+        return [attr_dic, demand_dic], total_product_money
+
+    def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
+        if len(prem[0]['prem'])==1:
+            list_sentence = list_sentences[0]
+            list_entity = list_entitys[0]
+            _data = product_attrs[1]['demand_info']['data']
+            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
+            order_times = []
+            for entity in list_entity:
+                if entity.entity_type=='time':
+                    sentence = list_sentence[entity.sentence_index]
+                    s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
+                                   end_index=entity.end_index,size=20)
+                    entity_left = "".join(s[0])
+                    if re.search(re_bidding_time,entity_left):
+                        time_text = entity.entity_text.strip()
+                        standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2})日?)")
+                        time_match = re.search(standard_time,time_text)
+                        if time_match:
+                            time_text = time_match.group()
+                        order_times.append(time_text)
+            # print(order_times)
+            order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
+            order_times = [order_time for order_time in order_times if order_time[0]!=""]
+            if len(set(order_times))==1:
+                order_begin,order_end = order_times[0]
+                project_name = codeName[0]['name']
+                pack_info = [pack for pack in prem[0]['prem'].values()]
+                budget = pack_info[0].get('tendereeMoney',0)
+                product = prem[0]['product']
+                link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
+                        'order_begin': order_begin, 'order_end': order_end}
+                _data.append(link)
+            product_attrs[1]['demand_info']['data'] = _data
+        return product_attrs
+
+
+# docchannel类型提取
+class DocChannel():
+  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
+    # Load two frozen TF graphs: "life" classifies the announcement stage
+    # (意向/预告/公告/...), "type" classifies the document category.
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    self.sequen_len = 200  # content sequence length (150 and 200 were tried)
+    self.title_len = 30  # title sequence length
+    self.sentence_num = 10  # max keyword-centered snippets per document
+    # keywords used to pick informative snippets from the content
+    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+
+    # label index -> label name maps for the two classifiers
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
+    self.load_pattern()
+
+  def load_pattern(self):
+      '''Initialize the rule patterns used alongside the neural classifiers:
+      content regexes and title regexes for the document-type and
+      life-cycle labels, plus a false-win ("wrong_win") exclusion pattern.'''
+      # content-based patterns per document type
+      self.type_dic = {
+            '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
+            '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
+            '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' #|变更|答疑|澄清|中标|成交|合同|废标|流标
+        }
+
+      # title-based patterns per document type
+      self.title_type_dic = {
+            '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
+            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
+            '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
+            '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)',  # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
+            '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
+        }
+      # content-based patterns per life-cycle stage
+      self.life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预计(采购|招标)(时间|日期)',
+            '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
+            '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
+            '招标答疑': '现澄清为|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
+            '公告变更': '原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
+            '候选人公示': '候选人公示|评标结果公示',
+            '中标信息': '供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
+            '中标信息2': '(成交|中标)(日期|时间)[::\s]|成交金额:',
+            '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
+            '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
+        }
+      # title-based patterns per life-cycle stage
+      self.title_life_dic = {
+            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
+            '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
+            '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
+            '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)公示',
+            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
+            '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',  # 合同$|
+            '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
+            '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
+            # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示',
+            '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
+            '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
+        }
+
+      # phrases that look like win announcements but are not (exclusion rule)
+      self.wrong_win = '按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
+
+  def load_life(self,life_model):
+    '''Load the frozen life-cycle classifier graph from a .pb file next to
+    this module and return (session, title, content, dropout, softmax,
+    mask, mask_title) tensors/handles for inference.'''
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        # print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    '''Load the frozen document-type classifier graph from a .pb file next to
+    this module; same tensor layout and return shape as load_life.'''
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(os.path.dirname(__file__)+type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        # print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(self.kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= self.sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=self.sequen_len):
+      ids = [getIndexOfWords(w) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    try:
+      segword_title = ' '.join(selffool.cut(doctitle)[0])
+      segword_content = dochtmlcon
+    except:
+      segword_content = ''
+      segword_title = ''
+    if isinstance(segword_content, float):
+      segword_content = ''
+    if isinstance(segword_title, float):
+      segword_title = ''
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
+    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > self.sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:self.sequen_len])
+    # print('标题:',segword_title)
+    # print('正文:',segword_content)
+    datas.append(doc_sens.split())
+    datas_title.append(segword_title.split())
+    # print('完成预处理')
+    return datas, datas_title
+
+  def is_houxuan(self, title, content):
+    '''
+    通过标题和中文内容判断是否属于候选人公示类别
+    :param title: 公告标题
+    :param content: 公告正文文本内容
+    :return: 1 是候选人公示 ;0 不是
+    '''
+    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+        return 0
+      return 1
+    if re.search('候选人的?公示', content[:100]):
+      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+        return 0
+      return 1
+    else:
+      return 0
+
+  def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
+    not_extract_dic = {
+        104: '招标文件',
+        106: '法律法规',
+        107: '新闻资讯',
+        108: '拟建项目',
+        109: '展会推广',
+        110: '企业名录',
+        111: '企业资质',
+        112: '全国工程人员',
+        113: '业主采购'
+    }
+    if original_docchannel in not_extract_dic:
+        return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
+    if web_source_no in ['02104-7']:
+      return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
+
+    if isinstance(list_sentence, list):
+      token_l = [it.tokens for it in list_sentence]
+      tokens = [it for l in token_l for it in l]
+      content = ' '.join(tokens[:500])
+
+    title = re.sub('[^\u4e00-\u9fa5]', '', title)
+    if len(title)>50:
+        title = title[:20]+title[-30:]
+    data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
+    text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
+    title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
+    result = {'docchannel': {'docchannel':'', 'doctype':'', "original_docchannel_id": str(original_docchannel)}}
+
+    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
+    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={
+                                              self.type_title: array_title,
+                                              self.type_content: array_content,
+                                              self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
+                                              self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    result['docchannel']['doctype'] = self.id2type[id]
+    # print('公告类别:', self.id2type[id], '概率:',prob)
+    # if id == 0:
+    if result['docchannel']['doctype'] not in ['', '新闻资讯']:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={
+                                                self.lift_title: array_title,
+                                                self.lift_content: array_content,
+                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
+                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      result['docchannel']['docchannel'] = self.id2life[id]
+      # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
+      # if id == 6:
+      if result['docchannel']['docchannel'] == '中标信息':
+        if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          result['docchannel']['docchannel'] = '候选人公示'
+          # return '候选人公示', prob
+          # return [{'docchannel': '候选人公示'}]
+
+    return result
+    #   return [{'docchannel':self.id2life[id]}]
+    # else:
+    #   # return self.id2type[id], prob
+    #   return [{'docchannel':self.id2type[id]}]
+
+  def predict_rule(self, title, content, channel_dic, prem_dic):
+      '''2022/2/10加入规则去除某些数据源及内容过短且不包含类别关键词的公告不做预测'''
+      hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # 合同标题正则
+      zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
+      zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
+      zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
+      title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
+      if len(re.sub('[^\u4e00-\u9fa5]', "", content))<50 and channel_dic['docchannel']['doctype'] != '新闻资讯':
+          if re.search(hetong, title_cn) != None:
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+          elif re.search(zhongbiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '中标信息'
+          elif re.search(zhaobiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '招标公告'
+          else:
+              channel_dic['docchannel']['docchannel'] = ''
+      elif channel_dic['docchannel'].get('docchannel', '') == '招标公告' and 'win_tenderer' in json.dumps(prem_dic,
+                                                                                              ensure_ascii=False):
+          if re.search(hetong, title_cn) != None:
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+              log('正则把招标公告修改为合同公告')
+          elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
+                                                                                                      content):
+              channel_dic['docchannel']['docchannel'] = '中标信息'
+              log('正则把招标公告修改为中标信息')
+      elif channel_dic['docchannel'].get('docchannel', '') == '中标信息' and 'win_tenderer' not in json.dumps(prem_dic,
+                                                                                                    ensure_ascii=False):
+          if re.search(hetong, title_cn):
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+              log('正则把中标信息修改为合同公告')
+          elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
+                                                                                                      content):
+              pass
+          elif re.search(zhaobiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '招标公告'
+              log('正则把中标信息修改为招标公告')
+          elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content)==None:
+              channel_dic['docchannel']['docchannel'] = ''
+              log('正则把中标信息修改为空')
+      return channel_dic
+
  def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
      '''
      Mixed regex + model prediction; returns announcement type (doctype) and life-cycle stage (docchannel).
      :param title:  announcement title
      :param list_sentence: preprocessed sentence/entity list
      :param html: raw html content of the announcement
      :param bidway: bidding method string
      :param prem: extracted prem dict
      :return: dict in the form {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}}
      '''
      def cut_single_cn_space(text):
          # Re-attach single CJK characters (and "X:"-style prefixes) that were split by spaces.
          new_text = ""
          for w in text.split():
              if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
                  new_text += w
              else:
                  new_text += ' ' + w
          return new_text

      def html2text(html):
          # Strip html tags; anything after the rich-text attachment div becomes a marker token.
          ser = re.search('<div[^<>]*richTextFetch', html)
          if ser:
              html = html[:ser.start()]+'##richTextFetch##'
          text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
          text = re.sub('\s+', ' ', text)
          text = re.sub('[/|[()()]', '', text)
          text = cut_single_cn_space(text)
          return text[:20000]

      def count_diffser(pattern, text):
          # Count how many of the ';'-separated sub-patterns match; return (count, joined keywords).
          num = 0
          kw = []
          for p in pattern.split(';'):
              if re.search(p, text):
                  num += 1
                  kw.append(re.search(p, text).group(0))
          return num, ';'.join(kw)

      def is_contain_winner(extract_json):
          # True when the serialized extraction result mentions a winning tenderer.
          if re.search('win_tenderer', extract_json):
              return True
          else:
              return False

      def is_single_source(bidway, title):
          # Single-source procurement is treated as a de-facto award notice downstream.
          if re.search('单一来源|单一性采购', title):
              return True
          elif bidway == '单一来源':
              return True
          else:
              return False

      def get_type(title, text):
          # Decide doctype by regex priority: land/mining > auction > property trading > procurement > news.
          if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
                                                                   text):  # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
              return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
          elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
              return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
          elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
              if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
              return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
          elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
              return '采招数据', (
                          re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(
                  0)
          elif re.search(self.title_type_dic['新闻资讯'], title):
              if re.search(self.title_type_dic['采招数据'], title + text[:150]):
                  return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0)
              return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
          else:
              return '', '没有公告类型关键词,返回空'

      def get_life(title, text, extract_json="", bidway="",  original_docchannel=''):
          # Decide the life-cycle stage by rule priority; returns (stage, matched keyword / reason).
          if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
                      0)
              elif re.search(self.title_life_dic['候选人公示'], title):
                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
              elif re.search(self.title_life_dic['中标信息'], title):
                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
              elif re.search('终止|废标|流标', title):
                  return '废标公告', re.search('终止|废标|流标', title).group(0)
              elif is_single_source(bidway, title):
                  return '中标信息', 'bidway单一来源'
              return '采购意向', (
                          re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0)
          elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
              if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
                  return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
                      0)
              elif re.search(self.title_life_dic['候选人公示'], title):
                  return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
              elif re.search(self.title_life_dic['中标信息'], title):
                  return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
              elif re.search('终止|废标|流标', title):
                  return '废标公告', re.search('终止|废标|流标', title).group(0)
              # NOTE(review): passes extract_json where is_single_source expects bidway — likely intended `bidway`; confirm
              elif is_single_source(extract_json, title):
                  return '中标信息', 'bidway单一来源'
              return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
          elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
              if re.search(self.title_life_dic['废标公告'], title):
                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
              #         elif re.search('(中标|成交)结果', title[-8:]):
              #             return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
              return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0)
          elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len(
                  re.findall('(答:|回复:)', text)) >= 2:  # or re.search(self.title_life_dic['招标答疑'], text[:150])
              if re.search(self.title_life_dic['废标公告'], title):
                  return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
              elif re.search('(中标|成交)结果', title[-8:]):
                  return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
              return '招标答疑', (
                          re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or re.search(
                      '(答:|回复:)', text)).group(0)
          elif re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150]):
              return '废标公告', (
                          re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150])).group(0)
          elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
              if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None:
                  return '中标信息', '候选人公示排除,修改为中标信息'
              return '候选人公示', (
                          re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group(
                  0)
          elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[
                                                                                             :150]):
              return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'],
                                                                                    text[:150]) or re.search(
                  self.life_dic['合同公告'], text)).group(0)
          elif re.search(self.life_dic['合同公告'].replace(';', '|'), text):  # or re.search(self.life_dic['合同公告'], text[:300]):
              # Body-only contract keywords: require 3+ distinct hits, else fall through to other signals.
              num, kw = count_diffser(self.life_dic['合同公告'], text)
              if num >= 3:
                  return '合同公告', kw
              elif re.search(self.title_life_dic['招标公告'], title[-8:]):
                  return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
              elif not is_contain_winner(extract_json):
                  return '', '有合同关键词无中标角色返回空'
              return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
          # NOTE(review): passes extract_json where is_single_source expects bidway — likely intended `bidway`; confirm
          elif is_single_source(extract_json, title):
              return '中标信息', '单一来源采购'
          elif re.search(self.title_life_dic['中标信息'], title):
              if re.search(self.title_life_dic['资审结果'], title+text[:150]):
                  return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
              return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
          elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]):
              if re.search(self.title_life_dic['资审结果'], title+text[:150]):
                  return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
              # if re.search(self.wrong_win, text):
              #     return '招标公告', re.search(self.wrong_win, text).group(0)
              return '中标信息', (
                          re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group(
                  0)
          elif re.search(self.life_dic['中标信息2'], text[:]):
              # Weaker win signal: reject if a known false-positive pattern is present.
              if re.search(self.wrong_win, text):
                  return '招标公告', re.search(self.wrong_win, text).group(0)
              return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0)
          elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json):
              if re.search(self.wrong_win, text):
                  return '招标公告', re.search(self.wrong_win, text).group(0)
              return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0)
          elif re.search('公开选取.{,20}机构的公告', title):
              # Agency-selection notices: award vs. tender depends on whether a winner is named.
              if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
                  return '中标信息', '机构选取有中选机构'
              else:
                  return '招标公告', '公开选取机构'
          elif is_contain_winner(extract_json):
              # Extraction found a winner; still double-check against tender-keyword density.
              num, kw = count_diffser(self.life_dic['招标公告'], text)
              if re.search(self.wrong_win, text):
                  return '招标公告', re.search(self.wrong_win, text).group(0)
              elif num >= 2:
                  return '招标公告', kw
              elif re.search('##richTextFetch##', text):
                  return '', '提取到中标人但包含附件返回空'
              return '中标信息', '提取到中标人'
          elif re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:]):
              return '资审结果', (re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:])).group(0)
          elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:]):
              if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
                  return '', '招标正则召回标题有其他类别关键词,返回空'
              return '招标公告', (re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'),
                                                                                    text[:])).group(0)
          else:
              return '', '未预测到关键词, 返回空'

      # Channels whose documents are never classified.
      not_extract_dic = {
          104: '招标文件',
          106: '法律法规',
          107: '新闻资讯',
          108: '拟建项目',
          109: '展会推广',
          110: '企业名录',
          111: '企业资质',
          112: '全国工程人员',
          113: '业主采购'
      }
      if original_docchannel in not_extract_dic:
          return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
      if web_source_no in ['02104-7', '04733']: # these sources cannot be classified
          return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}

      title = re.sub('[^\u4e00-\u9fa5]', '', title)
      if len(title) > 50:
          title = title[:20] + title[-30:]

      text = html2text(html)
      prem_json = json.dumps(prem, ensure_ascii=False)
      result = {'docchannel': {'docchannel': '', 'doctype': ''}}

      doc_type, type_kw = get_type(title, text)
      doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
      if doc_type in self.title_type_dic:
          result['docchannel']['doctype'] = doc_type
      if doc_life in self.title_life_dic:
          result['docchannel']['docchannel'] = doc_life

      # Fall back to the TF models for whichever field the rules could not decide.
      if doc_type=="" or doc_life=="":
          list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
          token_l = [it.tokens for it in list_sentence]
          tokens = [it for l in token_l for it in l]
          content = ' '.join(tokens[:500])
          data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
                                                          dochtmlcon=content)  # title capped at 50 chars
          text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
          title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len

          array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
          array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))

          if doc_type == "":
              pred = self.type_sess.run(self.type_softmax,
                                        feed_dict={
                                            self.type_title: array_title,
                                            self.type_content: array_content,
                                            self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                            self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                            self.type_prob: 1}
                                        )
              id = np.argmax(pred, axis=1)[0]
              prob = pred[0][id]
              result['docchannel']['doctype'] = self.id2type[id]
              # print('公告类别:', self.id2type[id], '概率:',prob)
              # if id == 0:
          if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
              # Only run the life-cycle model when the text is long enough and contains keywords.
              if len(text)>150 and re.search(self.kws, content):
                  pred = self.lift_sess.run(self.lift_softmax,
                                            feed_dict={
                                                self.lift_title: array_title,
                                                self.lift_content: array_content,
                                                self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                                self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                                self.lift_prob: 1}
                                            )
                  id = np.argmax(pred, axis=1)[0]
                  prob = pred[0][id]
                  # A model-predicted "win" on an upstream tender channel with no extracted winner is reverted.
                  if self.id2life[id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json):
                      result['docchannel']['docchannel'] = '招标公告'
                  else:
                      result['docchannel']['docchannel'] = self.id2life[id]
                      # print('生命周期:',self.id2life[id], '概率:',prob)
                      # if id == 6:
                      if result['docchannel']['docchannel'] == '中标信息':
                          if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
                                             ''.join([it for it in content if it.isalpha()])):
                              result['docchannel']['docchannel'] = '候选人公示'
                              # return '候选人公示', prob
                              # return [{'docchannel': '候选人公示'}]
      # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw))
      # print('result: ', result)
      return result
+
+# 保证金支付方式提取
class DepositPaymentWay():
    """Extract the bid-deposit payment method from announcement text."""

    def __init__(self):
        # Pattern 1: "保证金…方式: <payload>" — payload captured in group 3.
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        # Pattern 2: "保证金(须以/通过/以)<payload>方式" — payload captured in group 2.
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # Longest-first so the alternation prefers the most specific keyword.
        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)

    def predict(self, content):
        """Return {'deposit_patment_way': 'kw1;kw2'} (key spelling kept for backward compatibility).

        :param content: announcement text
        :return: dict with ';'-joined payment-method keywords, empty string when none found
        """
        pay_way = {'deposit_patment_way': ''}
        kw_pattern = '|'.join(self.kws)
        # Try each payload pattern in priority order; the first match wins (same as the
        # former duplicated if/elif branches, now de-duplicated).
        for pattern, payload_group in ((self.pt, 3), (self.pt2, 2)):
            match = re.search(pattern, content)
            if match:
                payload = match.group(payload_group)
                found = [m.group(0) for m in re.finditer(kw_pattern, payload)]
                pay_way['deposit_patment_way'] = ';'.join(found)
                return pay_way
        return pay_way
+
+
+# 总价单价提取
class TotalUnitMoney:
    """Flag money entities as total price or unit price (mutates entities in place)."""

    def __init__(self):
        pass

    def predict(self, list_sentences, list_entitys):
        """Walk every money entity and set is_total_money / is_unit_money.

        :param list_sentences: per-document sentence lists, parallel to list_entitys
        :param list_entitys: per-document entity lists
        """
        for doc_idx, entity_list in enumerate(list_entitys):
            for entity in entity_list:
                if entity.entity_type != 'money':
                    continue
                sentence_text = list_sentences[doc_idx][entity.sentence_index].sentence_text
                offsets = [entity.wordOffset_begin, entity.wordOffset_end]
                if entity.label == 1:
                    # Bid-amount entities: look for a "total price" context.
                    if extract_total_money(sentence_text, entity.entity_text, offsets):
                        entity.is_total_money = 1
                else:
                    # Ordinary amounts: look for a "unit price" context.
                    if extract_unit_money(sentence_text, entity.entity_text, offsets):
                        entity.is_unit_money = 1
+
+
def getSavedModel():
    """Export the form-classification Keras model as a TF SavedModel under ./h5_savedmodel/."""
    graph = tf.Graph()
    with graph.as_default():
        keras_model = tf.keras.models.load_model(
            "../form/model/model_form.model_item.hdf5",
            custom_objects={"precision": precision, "recall": recall, "f1_score": f1_score})
        # Serialize the live Keras session with named input/output signatures.
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": keras_model.input},
            outputs={"scores": keras_model.output}
        )
+        
def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    """Build an Embedding -> BiLSTM -> Dense -> CRF sequence-tagging model.

    :param MAX_LEN: unused, kept for interface compatibility
    :param vocab: vocabulary; its length sizes the embedding table
    :param EMBED_DIM: embedding dimension
    :param BiRNN_UNITS: total BiLSTM units (split across both directions)
    :param chunk_tags: tag set; its length sizes the output layer
    :param weights: optional pretrained embedding matrix, or None for random init
    :return: compiled Keras model
    """
    inp = layers.Input(shape=(None,), dtype="int32")
    if weights is None:
        embed = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True)(inp)
    else:
        # Initialize from the pretrained matrix but keep it trainable.
        embed = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True, weights=[weights], trainable=True)(inp)
    lstm_out = layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))(embed)
    dense_out = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(lstm_out)
    crf = CRF(len(chunk_tags), sparse_target=True)
    tags = crf(dense_out)
    model = models.Model(input=[inp], output=[tags])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
+
+
import h5py
def h5_to_graph(sess,graph,h5file):
    """Copy weights stored in a Keras .hdf5 checkpoint into an existing TF graph.

    Walks the "model_weights" group of *h5file* and assigns every stored array
    to the tensor of the same name in *graph* via sess.run(tf.assign(...)).

    NOTE(review): relies on h5py's Dataset ``.value`` accessor, which was
    removed in h5py 3.x (use ``ds[()]`` there) - confirm the pinned h5py version.

    :param sess: active tf.Session bound to *graph*
    :param graph: graph whose same-named variables will be overwritten
    :param h5file: path to a Keras-saved HDF5 weight file
    """
    f = h5py.File(h5file,'r')   # open the h5 file
    def getValue(v):
        # Resolve a tensor's slash-separated name to a dataset under model_weights.
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index==1:
                # NOTE(review): the first path component is applied a second time
                # here (Keras nests the layer name twice under model_weights) -
                # verify against the actual file layout
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value
            
    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.
    
        This method deals with an inherent problem
        of HDF5 file which is not able to store
        data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
    
        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.
    
        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            # attributes larger than the header limit are chunked as name0, name1, ...
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8')
                            for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data
    
    def readGroup(gr,parent_name,data):
        # Recursively collect [full_name, array] pairs for every dataset in *gr*.
        for subkey in gr:
            print(subkey)
            if parent_name!=subkey:
                if parent_name=="":
                    _name = subkey
                else:
                    _name = parent_name+"/"+subkey
            else:
                # skip a path component that merely repeats its parent's name
                _name = parent_name
            if str(type(gr[subkey]))=="<class 'h5py._hl.group.Group'>":
                readGroup(gr[subkey],_name,data)
            else:
                data.append([_name,gr[subkey].value])
                print(_name,gr[subkey].shape)
                
    
    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name,np.asarray(g[weight_name])])
    '''
    # Assign each stored array to the graph tensor with the identical name.
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense",name) is not None:
            name = name[:7]+"_1"+name[7:]
        '''
        value = name_value[1]
        print(name,graph.get_tensor_by_name(name),np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name),value))
+
+
def initialize_uninitialized(sess):
    """Initialize only the not-yet-initialized Adam optimizer slot variables."""
    all_vars = tf.global_variables()
    init_flags = sess.run([tf.is_variable_initialized(v) for v in all_vars])
    uninitialized = [v for v, initialized in zip(all_vars, init_flags) if not initialized]

    # restrict to Adam slots: restored model weights must stay untouched
    adam_vars = [v for v in uninitialized if re.search("Adam", v.name) is not None]

    print([str(i.name) for i in adam_vars]) # only for testing
    if len(adam_vars):
        sess.run(tf.variables_initializer(adam_vars))
+    
+      
def save_codename_model():
    """Restore the BiLSTM-CRF code/name model from a TF checkpoint and export
    it as a SavedModel under ./codename_savedmodel_tf/ for serving.
    """
    # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
    filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"
    # vocab = load(vocabpath)
    # class_labels = load(classlabelspath)
    w2v_matrix = load('codename_w2v_matrix.pk')
    graph = tf.get_default_graph()
    with graph.as_default() as g:
        ''''''
        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
        #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
        
        sess = tf.Session(graph=g)
        # sess = tf.keras.backend.get_session()
        # rebuild the graph ops first, then restore trained weights from the checkpoint
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        #with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # print(sess.run("time_distributed_1/kernel:0"))
        # model.load_weights(filepath)
        saver = tf.train.Saver()
        saver.restore(sess, filepath)

        # print("logits",sess.run(logits))
        
        # print("#",sess.run("time_distributed_1/kernel:0"))

        # x = load("codename_x.pk")
        #y = model.predict(x)
        # y = sess.run(model.output,feed_dict={model.input:x})
        
        # for item in np.argmax(y,-1):
        #     print(item)
        # export: serving needs the raw logits plus the CRF transition matrix
        tf.saved_model.simple_save(
                                    sess,
                                    "./codename_savedmodel_tf/",
                                    inputs={"inputs": char_input,
                                            "inputs_length":length,
                                            'keepprob':keepprob},
                                    outputs={"logits": logits,
                                             "trans":trans}
        )
+        
+    
def save_role_model():
    '''
    @summary: export the role model as a SavedModel for deployment on the PAI platform
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        
        # initialize, then overwrite variables with the trained h5 weights
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        model = model_role.getModel()
        
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )
+
+
def save_money_model():
    """Load the money-extraction Keras model from .h5 and export it as a
    SavedModel under ./money_savedmodel2/.
    """
    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
    graph = tf.Graph()
    with graph.as_default():

        sess = tf.Session(graph=graph)

        with sess.as_default():
            # model = model_money.getModel()
            # model.summary()
            # sess.run(tf.global_variables_initializer())
            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)

            # custom metrics must be provided for deserialization
            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
            model.summary()
            print(model.weights)
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs = {"input0":model.input[0],
                                                 "input1":model.input[1],
                                                 "input2":model.input[2]},
                                       outputs = {"outputs":model.output}
                                       )
+    
+
def save_person_model():
    """Export the person (EPC) model as a SavedModel, sanity-checking it first
    against the pickled sample batch person_x.pk.
    """
    model_person = EPCPredict().model_person
    with model_person.graph.as_default():
        
        x = load("person_x.pk")
        # reorder to (input_slot, batch, ...) so _data[0]/_data[1] feed the two inputs
        _data = np.transpose(np.array(x),(1,0,2,3))
        model = model_person.getModel()
        
        sess = tf.Session(graph=model_person.graph)
        with sess.as_default():
            
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()
        
        
        #h5_to_graph(sess, model_person.graph, model_person.model_person_file)
        
        # smoke test: run one prediction before exporting
        predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
        #predict_y = model.predict([_data[0],_data[1]])
        print(np.argmax(predict_y,-1))
        
        tf.saved_model.simple_save(sess,
                                   "./person_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1]},
                                   outputs = {"outputs":model.output})
+    
def save_form_model():
    """Export the form 'item' classifier as a SavedModel under ./form_savedmodel/."""
    model_form = FormPredictor()
    with model_form.graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=model_form.graph)
        sess.run(tf.global_variables_initializer())
        # overwrite freshly-initialized variables with the trained h5 weights
        h5_to_graph(sess, model_form.graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs":model.input},
                                   outputs = {"outputs":model.output})
+    
def save_codesplit_model():
    """Export the code-split model (three inputs) as a SavedModel."""
    filepath_code = "../projectCode/models/model_code.hdf5"
    
    
    graph = tf.Graph()
    with graph.as_default():
        model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        # restore the trained weights into the fresh session from the same h5 file
        h5_to_graph(sess, graph, filepath_code)
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs={"input0":model_code.input[0],
                                           "input1":model_code.input[1],
                                           "input2":model_code.input[2]},
                                   outputs={"outputs":model_code.output})
+
def save_timesplit_model():
    """Export the time-classification model as a SavedModel under ./timesplit_model/."""
    filepath = '../time/model_label_time_classify.model.hdf5'
    with tf.Graph().as_default() as graph:
        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            # overwrite initialized variables with the trained h5 weights
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs={"input0":time_model.input[0],
                                               "input1":time_model.input[1]},
                                       outputs={"outputs":time_model.output})
+
+
if __name__=="__main__":
    # Export entry points (enable one at a time as needed):
    #save_role_model()
    # save_codename_model()
    # save_money_model()
    #save_person_model()
    #save_form_model()
    #save_codesplit_model()
    # save_timesplit_model()
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    #     from tensorflow.python.saved_model import tag_constants
    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    #     graph = tf.get_default_graph()
    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    #     signature = meta_graph_def.signature_def
    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    #     x = load("person_x.pk")
    #     _data = np.transpose(x,[1,0,2,3])
    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
    #     print(np.argmax(y,-1))
    '''

    # Latency benchmark of the deployed codeName endpoint:
    # encode a repeated sample sentence into padded index sequences and time
    # two consecutive requests (the second measures the warmed-up server).
    MAX_LEN = 1000
    vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
    vocab = load(vocabpath)
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    index_unk = word2index.get("<unk>")
    sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \
               "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \
               "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件,"
    sentence = sentence*5
    list_sentence = [sentence]*200
    # print(list_sentence)
    x = [[word2index.get(word, index_unk) for word in sentence] for sentence in
         list_sentence]
    x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
    # print(x_len)
    x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")

    # NOTE(review): "inouts"/"inouts_len" look like typos for "inputs", but they
    # are the field names the server expects - do not rename unilaterally
    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
                                    verify=True)
    # predict_y = json.loads(requests_result.text)['result']
    print("cost_time:", json.loads(requests_result.text)['cost_time'])
    print(MAX_LEN, len(sentence), len(list_sentence))
    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
                                    verify=True)
    # predict_y = json.loads(requests_result.text)['result']
    print("cost_time:", json.loads(requests_result.text)['cost_time'])
    print(MAX_LEN, len(sentence), len(list_sentence))

+ 325 - 0
BiddingKG/re_servicetime.py

@@ -0,0 +1,325 @@
+#coding:UTF-8
+import re
+import pandas as pd
+from bs4 import BeautifulSoup
+
# When True, extra debug printing is enabled and whitespace is stripped from input.
TEST_MODE = False


# before = '(?P<before>' \
#          '合同期限|工期/交货期/服务期|工期,\(日历天\)|工期\(交货期\)|合格工期\(天\)|服务期限\(年\)|工期\(天\)' \
#          '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
#          '|合格工期|计划工期\(服务期\)|服务期\(日历天\)|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
#          '|交货时间|工期\(日历天\)' \
#          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
#          '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期|服务要求' \
#          '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|工期|供货期|合同履行日期|计划周期|工期' \
#          ')'

# "before": leading label words that introduce a service/work period
# (e.g. contract term, construction period, delivery period, warranty period).
before = '(?P<before>' \
         '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
         '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
         '|交货时间|工期|质保期' \
         '|服务期限为|计划工期|工期要求|服务期限|服务期' \
         '|投标工期|设计工期|合格服务周期|总工期|服务时间|流转期限|维护期限|服务时限|交货期' \
         '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
         '|履约期限|合同约定完成时限|合同完成日期' \
         ')'


# ^(?!.*abc).*$ 排除abc字符串 (pattern for excluding a string, kept for reference)
# "before" variant for property-management ("物业") announcements:
# anchors on the "performance period / location summary" label.
before_wuye = '(?P<before>' \
              '(履约期限、地点等简要信息[::])|(履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))' \
              ')'

# "before2": connective phrases between the label and the value,
# e.g. "from the date the contract is signed", "no more than", "planned".
before2 = '(?P<before2>' \
          '自合同签订之日起至|合同签订之日起|自合同签订之日起|开工后|不超过|签订合同后|系统开发' \
          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|中选后|均为|合同签订日至|合同期' \
          '|.{0,1}合同签订.{0,3}|计划|从|合同签订生效之日起|本项目招标有效期' \
          '|[自从]?签[订定]合同(之日|后).{1,4}|[自从]?(采购)?合同签[订定](之日|后|).{1,5}|签订合同起' \
          '|项目的有效期限为|项目服务为|签订合同期为|合同签[订定]生效之日.{1,4}' \
          '|[自从]服务合同生效之日.{1,4}|[自从].{2,15}之日.{1,4}|(本次)?采购周期' \
          '|(项目招标)?履行期|[自从于]?合同生效之日.{1,3}|' \
          ')'

# "before3": optional unit annotation right after the label, e.g. "(日历天)", "(天)".
before3 = '(?P<before3>' \
          '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
          ')'

# "charac": filler/punctuation characters between label and value (":", "为", "是", ...).
charac = '(?P<charac>' \
         '[::,,【()】为是起暂定的有效期限]*' \
         ')'

# "center": the period value itself - a date range, a single date, or a number
# (Arabic or Chinese numerals).
center = '(?P<center>' \
         '[自为约是起暂定的拟期从]{0,3}(\d{2,4}[-.年/]?\d{1,2}[-.月/]?\d{1,2}[日号]?[-~~起至—]+(\d{2,4}[-.年/]?)?\d{1,2}[-.月/]?\d{1,2}[日号]?|\d{2,4}年\d{1,2}月\d{1,2}[日号]|[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+)' \
         ')'

# "number": bare duration number (Arabic or Chinese numerals, incl. "半" = half).
number = '(?P<number>' \
         '[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+' \
         ')'

# "after": trailing duration unit ("天", "个月", "日历天", "工作日", ...).
# NOTE: the alternation ends with '|' so the group may also match empty.
after = '(?P<after>' \
        '周年|周|号|天|个月|个年|年|个日历天|日历天|日|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
        ')'

# "after1": explicit date or date-range value, e.g. "2021-01-31" or "2021年1月-6月".
after1 = '(?P<after1>' \
         '[自为约是起暂定的拟从]{0,3}\d{4}[-年/]?(\d{1,2}[-月/]?)?(\d{1,2}[日号]?)?[-~~起至—]+(\d{4}[-年/]?)?(\d{1,2}[-月/]?)?(\d{1,2}日?)?(-\d{1,2}[日号]?)?([】)]?)' \
         ')'

# "after2": plain integer value.
after2 = '(?P<after2>' \
         '\d+' \
         ')'

# "after3": free text ending in "止" ("until ..."), optionally with a unit.
after3 = '(?P<after3>' \
         '(.{0,25}止([\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾][年月日])?)' \
         ')'

# Compiled patterns, tried by re_serviceTime in the order reg2, reg, reg1, reg3,
# reg4, reg_wuye (most to least specific).
reg = re.compile(before + before3 + charac + before2 + center + after)

reg1 = re.compile(before + before3 + charac + after3)

reg2 = re.compile(before + before3 + charac + before2 + after1)

reg3 = re.compile(before + before3 + charac + before2 + after2)

# before2[:-2]+before2[-1:] strips the trailing '|' inside the group so the
# connective phrase becomes mandatory when there is no leading label.
reg4 = re.compile(before2[:-2]+before2[-1:] + center + after)

# reg4 = re.compile(before2[:-2]+before2[-1:] + number + after)
# print(before2[:-2]+before2[-1:])

reg_wuye = re.compile(before_wuye + center + after)

# Phrases that must never trigger a match; they are masked out with '#' first.
reg_not = re.compile(u'(工期延误|工期节点|工期管理|交付使用'
                     u'|工期、)'
                     u'|工期情况|划工期内|服务期内')

reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
                      u'|务期限:1、|签订日期|证金在合同签|服务期限截止'
                      u')')

# reg_not2 = re.compile(u'(截止|1\\.|1、)')
reg_not2 = re.compile(u'(截止)')

# Post-filters: a valid match must contain a digit ...
reg_right_digit = re.compile(u'[\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+')

# ... and a time unit / separator character ...
reg_right_unit = re.compile(u'[-.年月日号天~~至—]')

# ... and none of these announcement-related words.
reg_error = re.compile(u'公告|发布|中')
+
+
def re_serviceTime(text):
    """Extract service-period / work-duration phrases from *text*.

    Masks known-confusable phrases, then tries reg2, reg, reg1, reg3, reg4
    and reg_wuye in that order, keeping the results of the first pattern
    that matches, and finally filters out obvious false positives.

    :param text: full document text
    :return: tuple (list of phrases re-sliced from *text*, list of
             [start, end] index pairs into *text*)
    """
    if TEST_MODE:
        # print(chardet.detect(text))
        text = re.sub("\s*", "", text)

    text_list = [text]
    all_output_list = []
    all_text_index_list = []

    for index in range(len(text_list)):
        output_list = []
        input_str = text_list[index]

        # Mask confusable phrases with '#' of equal length so that every
        # character index into the original text stays valid.
        for _reg_not in [reg_not, reg_not1, reg_not2]:
            match_iter = re.finditer(_reg_not, input_str)
            for match in match_iter:
                word_index = match.span()
                word = match.group()
                instead = "#" * len(word)
                if TEST_MODE:
                    # was an unconditional print; debug output is now gated
                    print("word, instead, word_index", word, instead, word_index)
                input_str = input_str[:word_index[0]] + instead + input_str[word_index[1]:]

        if TEST_MODE:
            print("input_str", input_str)

        # Try the patterns from most to least specific; stop at the first hit.
        output_list, text_index_list = re_findAllResult(reg2, input_str)
        if TEST_MODE:
            print("output_str, text_index reg2", output_list, text_index_list)

        if len(output_list) == 0:
            output_list, text_index_list = re_findAllResult(reg, input_str)
            if TEST_MODE:
                print("output_str, text_index reg", output_list, text_index_list)

        if len(output_list) == 0:
            output_list, text_index_list = re_findAllResult(reg1, input_str)
            if TEST_MODE:
                print("output_str, text_index reg1", output_list, text_index_list)

        if len(output_list) == 0:
            output_list, text_index_list = re_findAllResult(reg3, input_str)
            if TEST_MODE:
                print("output_str, text_index reg3", output_list, text_index_list)

        if len(output_list) == 0:
            output_list, text_index_list = re_findAllResult(reg4, input_str)
            if TEST_MODE:
                print("output_str, text_index reg4", output_list, text_index_list)

        if len(output_list) == 0:
            output_list, text_index_list = re_findAllResult(reg_wuye, input_str)
            if TEST_MODE:
                print("output_str, text_index reg_wuye", output_list, text_index_list)

        # Filter false positives. Outputs and their index pairs are kept in
        # lockstep; the previous list.remove()-based deletion could remove the
        # wrong occurrence when duplicate phrases appeared.
        keep_outputs = []
        keep_indexes = []
        for output, text_index in zip(output_list, text_index_list):
            # must contain a digit ...
            if not re.findall(reg_right_digit, output):
                continue
            # ... and a time unit/separator
            if not re.findall(reg_right_unit, output):
                continue
            # must not contain a blacklisted word
            if re.findall(reg_error, output):
                continue
            # a bare year like "2021年" (no finer unit) is a date, not a duration
            if not re.findall("[月日天号]", output) and len(re.findall("年", output)) == 1:
                year_time = re.search("\d+", output)
                if year_time is not None and int(year_time.group()) >= 2000:
                    if TEST_MODE:
                        print("delete output", output)
                    continue
            keep_outputs.append(output)
            keep_indexes.append(text_index)

        all_output_list += keep_outputs
        all_text_index_list += keep_indexes

    # Re-slice the phrases out of the original text; pad all but the last with
    # a trailing space so joined output stays separable.
    index2word = []
    for i in range(len(all_text_index_list)):
        word = text[all_text_index_list[i][0]:all_text_index_list[i][1]]
        if i != len(all_text_index_list)-1:
            word = word + " "
        index2word.append(word)

    if TEST_MODE:
        print("index2word all_text_index_list", index2word, all_text_index_list)
    return index2word, all_text_index_list
+
+
def re_findAllResult(reg, input, unit="", index=0):
    """Run *reg* over *input* and rebuild each match from its named groups.

    :param reg: compiled pattern using (a subset of) the named groups listed
                in ``group_order`` below
    :param input: sentence to search
    :param unit: unused, kept for interface compatibility
    :param index: unused, kept for interface compatibility
    :return: tuple (list of reassembled match strings, list of [start, end]
             index pairs into *input*; *start* is advanced past the leading
             label so the span covers only the value part)
    """
    text_index = []
    output_list = []
    # Groups are concatenated in this fixed order; absent or empty groups
    # contribute nothing (replaces ten copy-pasted if-blocks).
    group_order = ["before", "before3", "charac", "before2", "center",
                   "number", "after", "after1", "after2", "after3"]
    for i in re.finditer(reg, input):
        d = i.groupdict()
        output = "".join(d.get(name) or "" for name in group_order)

        # Compute how much of the match prefix to skip in the reported span.
        if d.get("before") is not None:
            # NOTE(review): when before3 matched a non-empty unit like "(天)"
            # only the label itself is skipped; otherwise the filler characters
            # and the connective phrase are skipped as well - confirm intent
            if d.get("before3") is None or d.get("before3") != "":
                front_len = len(d.get("before"))
            else:
                front_len = len(d.get("before")) + len(d.get("charac"))
                if d.get("before2") is not None:
                    front_len += len(d.get("before2"))
        else:
            front_len = 0
        text_index.append([i.start()+front_len, i.end()])
        output_list.append(output)
    return output_list, text_index
+
+
def calculateLen(ss, i):
    """Return the total string length before and after pivot index *i*.

    :param ss: sequence of strings
    :param i: pivot index; ss[i] itself is counted in neither total
    :return: tuple (front_len, back_len)
    """
    # sum() over slices replaces the two manual accumulation loops
    front_len = sum(len(s) for s in ss[:i])
    back_len = sum(len(s) for s in ss[i + 1:])
    return front_len, back_len
+
+
def extract_servicetime(text):
    """Wrap re_serviceTime matches as dicts, dropping overlong phrases (>35 chars)."""
    list_servicetime = []
    word_list, text_index_list = re_serviceTime(text)
    # print(word, text_index_list)
    for word, (begin, end) in zip(word_list, text_index_list):
        if len(word) <= 35:
            list_servicetime.append({"body": word, "begin_index": begin, "end_index": end})
    if TEST_MODE:
        print("list_servicetime", list_servicetime)
    return list_servicetime
+
+
def test_from_str():
    """Manual smoke test: extract the service period from a single sample string."""
    # s = """
    # 青岛市即墨区新兴中学物业管理服务项目 信息公开 合同公告 一、合同编号:D202101110008 二、合同名称:物业管理服务项目 三、项目编码(或招标编号、政府采购计划编号、采购计划备案文号等,如有):D202101110008 四、项目名称:物业管理服务项目 五、合同主体 采购人(甲方):青岛市即墨区新兴中学 地址:新兴路288号 联系方式:0532-88509712 供应商(乙方):青岛安之信物业管理有限公司 地址:山东省青岛市即墨区振华街87号恒生源大厦5楼 联系方式:0532-88510757 15966863456 六、合同主要信息 主要标的名称:物业管理服务 规格型号(或服务要求):1、所有上岗人员均要求符合相应年龄、性别及学历标准,身体健康,品德端正,无任何违法犯罪记录。 2、门卫需24小时在岗等。 3、卫生保洁人员:负责学校公共区域卫生保洁等。 4、校园绿化人员:根据季节要求,规范养护校园植物等。 5、公共设施设备维修维护:维修 主要标的数量:1.0 主要标的单价:277200.0 合同金额:27.72 万元 履约期限、地点等简要信息:2021-01-31、即墨区新兴中学 采购方式:网上超市 七、合同签订日期:2021-01-11 八、合同公告日期:2021-01-11 九、其他补充事宜: 附件: 『查看附件』 发 布 人:青岛市即墨区新兴中学 发布时间: 2021年1月11日
    # """
    s = " 服务周期/到货期:2022年6月1日-2022年12月31日。 "
    print(extract_servicetime(s))
+
+
def test_from_csv():
    """Batch test: run extraction over a local CSV's "text" column and write
    the results back as a "new_word" column (hard-coded local paths).
    """
    df = pd.read_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text.csv")
    result_list = []
    for index, row in df.iterrows():
        result = extract_servicetime(row["text"])
        result_list.append(str(result))

    df["new_word"] = pd.DataFrame(result_list)
    df.to_csv("D:/BIDI_DOC/招标方式_服务期限_提取/serviceTime_text_new.csv")
+
+
def test_from_xlsx():
    """Batch test: strip HTML from the "dochtmlcon" column of a local Excel
    file, run extraction, and save the results (hard-coded local paths).
    """
    df = pd.read_excel("D:/BIDI_DOC/比地_文档/service_time_error.xlsx")
    result_list = []
    for index, row in df.iterrows():
        text = row["dochtmlcon"]
        # documents are stored as HTML; extract plain text first
        soup = BeautifulSoup(text, "lxml")
        text = soup.get_text(strip=True)
        result = extract_servicetime(text)
        result_list.append(str(result))

    df["new_word"] = pd.DataFrame(result_list)
    df.to_excel("D:/BIDI_DOC/比地_文档/service_time_error_new.xlsx", index=False)
+
+
# Manual entry point: run the single-string smoke test.
if __name__ == '__main__':
    test_from_str()

+ 17 - 0
BiddingKG/restart_extract.sh

@@ -0,0 +1,17 @@
#!/bin/bash
# Restart helper for the extraction HTTP workers.
# Usage: uncomment the kill line, then the nohup lines for the ports to start
# (one worker per port, 15030-15039).
# NOTE(review): the lines for ports 15034-15039 all log to extract_15033.log -
# looks like a copy/paste slip; confirm before enabling them.

#ps -ef | grep run_extract_server | grep -v grep |cut -c 9-16 |xargs kill -9




#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15030 > /data/python/extract_15030.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15031 > /data/python/extract_15031.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15032 > /data/python/extract_15032.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15033 > /data/python/extract_15033.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15034 > /data/python/extract_15033.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15035 > /data/python/extract_15033.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15036 > /data/python/extract_15033.log &
#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15037 > /data/python/extract_15033.log &
##nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15038 > /data/python/extract_15033.log &
##nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15039 > /data/python/extract_15033.log &

+ 150 - 0
BiddingKG/run_extract_server.py

@@ -0,0 +1,150 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  1 18:03:03 2018
+
+@author: DONG
+"""
+import sys
+import os
+from flask import Flask, jsonify
+from flask import abort
+from flask import request
+
+
# Make the parent package importable and force the Keras backend BEFORE any
# BiddingKG / keras import below.
sys.path.append(os.path.dirname(__file__)+"/..")
os.environ["KERAS_BACKEND"] = "tensorflow"

app = Flask(__name__)
# keep non-ASCII (Chinese) characters unescaped in JSON responses
app.config['JSON_AS_ASCII'] = False


import time
import uuid
from BiddingKG.dl.common.Utils import log
from BiddingKG.dl.interface.extract import predict
import numpy as np
import ctypes
import inspect
from threading import Thread
import traceback
import json

# run extraction on CPU only
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
sys.path.append(os.path.abspath("."))
+
+
# Custom JSON encoder: makes numpy arrays/scalars and bytes serializable.
class MyEncoder(json.JSONEncoder):
    """JSONEncoder that converts numpy arrays, numpy scalars and bytes."""

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating/np.integer cover every width variant and, unlike the
        # np.float_ alias used before, still exist on NumPy >= 2.0
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return json.JSONEncoder.default(self, obj)
+
def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed

    Injects *exctype* asynchronously into the thread with id *tid* via
    CPython's PyThreadState_SetAsyncExc. CPython-specific; the exception is
    only delivered when the target thread next executes bytecode, so threads
    blocked in C extensions may not stop immediately.
    """
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        # the C API requires an exception class, not an instance
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # more than one thread affected: undo the injection and bail out
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")
+
def stop_thread(thread):
    # Force-stop *thread* by injecting SystemExit into it (see _async_raise).
    _async_raise(thread.ident, SystemExit)
+
+
def run_thread(data,list_result):
    """Worker body for one extraction request.

    Appends the prediction result (a JSON string) to *list_result*; on failure
    appends True as a second element so the caller can map it to HTTP 500.

    :param data: request payload dict (doc_id, title, content, page_time, ...)
    :param list_result: shared output list, mutated in place
    """
    # data = data.decode("utf8")
    # data = json.loads(data,encoding="utf8")
    k = str(uuid.uuid4())
    cost_time = dict()
    _doc_id = data.get("doc_id","")
    _title = data.get("title","")
    _content = data.get("content","")
    _page_time = data.get("page_time","")
    data_res = ""

    web_source_no = data.get("web_source_no","")
    original_docchannel = data.get("original_docchannel","")
    is_fail = False
    try:
        if _content!="":
            data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no,original_docchannel)
        else:
            # empty content is reported as a failure payload but NOT flagged
            # is_fail (deliberate: it should not map to HTTP 500)
            data_res = json.dumps({"success":False,"msg":"content not passed"})
            # is_fail = True


    except Exception as e:
        traceback.print_exc()
        data_res = json.dumps({"success":False,"msg":str(e)})
        is_fail = True
    # return the result as json
    #_resp = json.dumps(data_res,cls=MyEncoder)
    #log(str(data["flag"])+str(data))
    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
    list_result.append(data_res)
    if is_fail:
        list_result.append(is_fail)
+
@app.route("/test",methods=['POST'])
def test():
    """Diagnostic endpoint: time form parsing and return the form's byte size."""
    _time = time.time()
    a = request.form.get("content")
    log("get form takes %.2fs"%(time.time()-_time))
    return json.dumps(sys.getsizeof(request.form)),201
+
+
+
@app.route('/content_extract', methods=['POST'])
def text_predict():
    """Main extraction endpoint.

    Runs the prediction in a worker thread with a client-configurable timeout
    (default 400s). Status codes: 201 success, 302 timed out (thread killed),
    500 prediction raised an exception.
    """

    _time = time.time()
    data = request.json

    status_code = 200
    list_result = []
    _timeout = data.get("timeout",400)
    log("get data cost:%.2fs"%((time.time()-_time)))
    t = Thread(target=run_thread,args=(data,list_result))
    start_time = time.time()
    t.start()
    t.join(_timeout)
    if t.is_alive():
        # still running after the timeout: force-kill via async exception
        stop_thread(t)
        status_code = 302  # killed on timeout
        data_res = json.dumps({"success":False,"msg":"timeout"})
    else:
        # status_code += int((time.time()-start_time)%10+1)
        status_code = 201
        data_res = list_result[0]
        # run_thread appends True as a second element when prediction failed
        if len(list_result)>1 and list_result[1] ==True:
            status_code = 500
    _resp = data_res
    # _resp = predict(doc_id=_doc_id,text=_content,title=_title,page_time=_page_time)

    return _resp,status_code
+
def getPort(argv):
    """Parse a "port=<n>" argument out of a command line.

    :param argv: argument list (e.g. sys.argv); the first item containing
                 "port=" wins
    :return: the parsed port, or 15030 when none is found
    """
    port = 15030
    for item in argv:
        _l = str(item).split("port=")
        if len(_l) > 1:
            try:
                port = int(_l[-1])
            except ValueError:
                # malformed value such as "port=abc": keep looking / keep default
                continue
            break
    return port
+
if __name__ == '__main__':
    port = getPort(argv=sys.argv)
    # log BEFORE app.run(): the call blocks until the server shuts down, so a
    # message placed after it would never appear while the server is serving
    log("ContentExtractor running")
    app.run(host='0.0.0.0', port=port, threaded=True, debug=False)

BIN
BiddingKG/vocab_word.pk


Algunos archivos no se mostraron porque demasiados archivos cambiaron en este cambio