Sfoglia il codice sorgente

线上流程增加redis处理流

luojiehua 2 anni fa
parent
commit
5b98acad04

+ 71 - 0
BiddingKG/dl/common/constDict.py

@@ -0,0 +1,71 @@
class ConstDict(object):
    """Dict-like container whose keys are the names listed in ``__slots__``.

    The base class declares an empty ``__slots__``, so it cannot hold any key
    itself; a subclass lists its permitted keys in its own ``__slots__``.
    Values are stored as instance attributes, and a key counts as present
    once its attribute has been assigned.
    """
    __slots__ = []

    def __init__(self, dic):
        """Populate the instance from the mapping ``dic``.

        Every key is assigned with ``setattr``; a key that is not a declared
        slot therefore raises ``AttributeError``.
        """
        for key, val in dic.items():
            setattr(self, key, val)

    def __iter__(self):
        # Iterates over every declared slot name, including slots that have
        # not been assigned yet (unlike keys(), which skips unset slots).
        return iter(self.__slots__)

    def __getitem__(self, item):
        """Return the value stored under ``item``.

        Non-string keys yield ``None``; a string key that is unset raises
        ``AttributeError`` (not ``KeyError``).
        """
        if not isinstance(item, str):
            return None
        return getattr(self, item)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, item):
        # hasattr() raises TypeError for non-string names; treat such keys as
        # absent, in line with __getitem__ tolerating them.
        return isinstance(item, str) and hasattr(self, item)

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return str(dict(self.items()))

    def get(self, key, default=None):
        """Return the value for ``key``, or ``default`` when it is unset."""
        return getattr(self, key, default)

    def update(self, dic):
        """Copy entries of ``dic`` whose key is a declared slot; ignore the rest.

        ``dic`` may be any object exposing ``items()`` (a dict or another
        ConstDict).
        """
        # items() exists on both Python 2 and Python 3 mappings, whereas the
        # previous iteritems() call failed on Python 3 dicts.
        for key, val in dic.items():
            if key in self.__slots__:
                setattr(self, key, val)

    def clear(self):
        """Unset every slot that currently holds a value."""
        for key in self.__slots__:
            if hasattr(self, key):
                delattr(self, key)

    def setdefault(self, key, val):
        """Return the value for ``key``, storing ``val`` first if it is unset."""
        if hasattr(self, key):
            return getattr(self, key)
        setattr(self, key, val)
        return val

    def pop(self, key, default=None):
        """Remove ``key`` and return its value, or ``default`` when unset."""
        if hasattr(self, key):
            val = getattr(self, key)
            delattr(self, key)
            return val
        return default

    def iteritems(self):
        return iter(self.items())

    def items(self):
        """Return ``(key, value)`` pairs for the assigned slots, in slot order."""
        return [(key, getattr(self, key, None)) for key in self.__slots__ if hasattr(self, key)]

    def iterkeys(self):
        return iter(self.keys())

    def itervalues(self):
        return iter(self.values())

    def keys(self):
        """Return the names of the assigned slots, in slot order."""
        return [key for key in self.__slots__ if hasattr(self, key)]

    def values(self):
        """Return the values of the assigned slots, in slot order."""
        return [getattr(self, key, None) for key in self.__slots__ if hasattr(self, key)]

+ 50 - 0
BiddingKG/dl/common/pool.py

@@ -0,0 +1,50 @@
+
+from multiprocessing import RLock
+import queue
+
+
class ConnectorPool():
    """Queue-backed pool of connector objects created by a factory callable.

    ``init_num`` connectors are created eagerly; further ones are created on
    demand until ``max_num`` have been created in total. Callers must hand
    every connector back with ``putConnector`` (ideally in a ``finally``
    block): once ``max_num`` connectors are checked out, ``getConnector``
    blocks until one is returned.
    """

    def __init__(self,init_num,max_num,method_init,**kwargs):
        """Build the pool.

        :param init_num: number of connectors created immediately
        :param max_num: upper bound on connectors created by this pool
        :param method_init: factory called as ``method_init(**kwargs)``
        :param kwargs: keyword arguments forwarded to ``method_init``
        """
        self.connector_pool = queue.Queue()
        for i in range(init_num):
            self.connector_pool.put(method_init(**kwargs))
        self.method_init = method_init
        self.kwargs = kwargs
        self._lock = RLock()
        # Number of connectors created so far (idle + checked out).
        self.pool_size = init_num
        self.max_num = max_num

    def getConnector(self):
        """Return an idle connector, creating one if the pool may still grow.

        Blocks when the pool is empty and ``max_num`` connectors already exist.
        """
        with self._lock:
            if self.connector_pool.empty() and self.pool_size<self.max_num:
                while 1:
                    try:
                        _conn = self.method_init(**self.kwargs)
                    except Exception:
                        # Best-effort, as before: keep retrying until the
                        # factory succeeds.
                        continue
                    # Count the new connector so that max_num is actually
                    # enforced (the counter was never advanced before).
                    self.pool_size += 1
                    return _conn
            return self.connector_pool.get(block=True)

    def putConnector(self,_conn):
        """Hand a connector back to the pool."""
        self.connector_pool.put(_conn)

    def destory(self):
        """Close every idle connector currently held in the queue."""
        while 1:
            try:
                conn = self.connector_pool.get(False)
            except queue.Empty:
                break
            try:
                conn.close()
            except Exception:
                # One connector failing to close must not stop the remaining
                # ones from being drained and closed.
                pass

    def __del__(self):
        self.destory()
+
+
+
+
# Placeholder entry point: running this module directly only prints 1.
if __name__ == "__main__":
    print(1)

+ 13 - 0
BiddingKG/dl/common/source.py

@@ -0,0 +1,13 @@
+
+
+import redis
+
import os

# Connection settings for the baseline Redis server. Each value can be
# overridden with an environment variable of the same name; the fallbacks are
# the values that used to be hard-coded, so existing deployments keep working.
REDIS_HOST = os.environ.get("REDIS_HOST", "192.168.2.103")
REDIS_PORT = int(os.environ.get("REDIS_PORT", 6379))
# NOTE(review): this fallback is a credential committed to source control;
# rotate it and supply REDIS_PASS through the environment instead.
REDIS_PASS = os.environ.get("REDIS_PASS", "daf!#@#fdasf234")
+
+
def getConnect_redis_baseline():
    """Return a new ``redis.StrictRedis`` client for database 6.

    Host, port and password come from the module-level ``REDIS_HOST``,
    ``REDIS_PORT`` and ``REDIS_PASS`` settings.
    """
    return redis.StrictRedis(
        host=REDIS_HOST,
        port=REDIS_PORT,
        db=6,
        password=REDIS_PASS,
    )

+ 101 - 25
BiddingKG/dl/entityLink/entityLink.py

@@ -12,6 +12,7 @@ _time = time.time()
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
 import json
+from BiddingKG.dl.common.constDict import ConstDict
 
 def edit_distance(source,target):
     dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
@@ -145,42 +146,67 @@ def get_nlp_enterprise(list_entity):
 
     return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
 
# True once the HUGE enterprise file has been selected, False when the regular
# file is used, None until getEnterprisePath() has run.
ENTERPRISE_HUGE = None

def getEnterprisePath():
    """Locate the enterprise-name file and report which variant was chosen.

    "LEGAL_ENTERPRISE_HUGE.txt" is preferred: first through
    getFileFromSysPath (presumably a lookup along sys.path -- confirm against
    Utils), then as a path relative to the current working directory. If it
    is not found, "LEGAL_ENTERPRISE.txt" is used, falling back to the bare
    filename when getFileFromSysPath returns None.

    :return: ``(path, is_huge)``; ``is_huge`` is also stored in the
        module-level ``ENTERPRISE_HUGE``.
    """
    global ENTERPRISE_HUGE

    filename_huge = "LEGAL_ENTERPRISE_HUGE.txt"
    huge_path = getFileFromSysPath(filename_huge)
    if huge_path is None and os.path.exists(filename_huge):
        huge_path = filename_huge
    if huge_path is not None:
        log("enterprise path:%s"%(huge_path))
        # The flag was previously False on every branch, so the HUGE file was
        # never reported as huge and the Redis lookup that depends on this
        # flag could never be reached.
        ENTERPRISE_HUGE = True
        return huge_path,ENTERPRISE_HUGE

    filename = "LEGAL_ENTERPRISE.txt"
    real_path = getFileFromSysPath(filename)
    if real_path is None:
        real_path = filename
    log("ENTERPRISE path:%s"%(real_path))
    ENTERPRISE_HUGE = False
    return real_path,ENTERPRISE_HUGE
+
+
+
 
-DICT_ENTERPRISE = {}
 DICT_ENTERPRISE_DONE = False
+
+POOL_REDIS = None
+
+ENTERPRISE_KEY_LEN = 3
+
+ENTERPRISE_PREFIX_LEN = 3
+ENTERPRISE_TAIL_LEN = 3
+
+SET_ENTERPRISE = set()
+SET_PREFIX_ENTERPRISE = set()
+SET_TAIL_ENTERPRISE = set()
 def getDict_enterprise():
-    global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
-    real_path = getEnterprisePath()
+    global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
+    real_path,is_huge = getEnterprisePath()
     with open(real_path,"r",encoding="UTF8") as f:
         for _e in f:
             if not _e:
                 continue
             _e = _e.strip()
             if len(_e)>=4:
-                key_enter = _e[:4]
-                if key_enter not in DICT_ENTERPRISE:
-                    DICT_ENTERPRISE[key_enter] = set()
-                DICT_ENTERPRISE[key_enter].add(_e[4:])
+                key_enter = _e[:ENTERPRISE_KEY_LEN]
+                SET_PREFIX_ENTERPRISE.add(key_enter)
+                SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
+                if not is_huge:
+                    SET_ENTERPRISE.add(_e)
+
+
+
+    log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
+    log("SET_TAIL_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_TAIL_ENTERPRISE)/1024/1024,len(SET_TAIL_ENTERPRISE)))
+    log("SET_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_ENTERPRISE)/1024/1024,len(SET_ENTERPRISE)))
 
-    log("dict_enterprise takes memory:%dM"%(sys.getsizeof(DICT_ENTERPRISE)/1024/1024))
     # for _e in ["河南省柘源","建筑工程有限公司"]:
     #     if not _e:
     #         continue
@@ -191,7 +217,41 @@ def getDict_enterprise():
     #             DICT_ENTERPRISE[key_enter] = set()
     #         DICT_ENTERPRISE[key_enter].add(_e[4:])
     DICT_ENTERPRISE_DONE = True
-    return DICT_ENTERPRISE
+
+
def init_redis_pool():
    """Create the module-wide Redis connector pool if it does not exist yet.

    The pool starts with one connector and is configured with a maximum of
    ten; it is stored in the module-level ``POOL_REDIS``.
    """
    global POOL_REDIS
    # Imported here rather than at module import time.
    from BiddingKG.dl.common.pool import ConnectorPool
    from BiddingKG.dl.common.source import getConnect_redis_baseline

    if POOL_REDIS is not None:
        return
    POOL_REDIS = ConnectorPool(
        init_num=1,
        max_num=10,
        method_init=getConnect_redis_baseline,
    )
+
+
def is_enterprise_exist(enterprise_name):
    """Tell whether ``enterprise_name`` is a known enterprise.

    When ``ENTERPRISE_HUGE`` is truthy the name is looked up as a key in
    Redis through the shared connector pool; otherwise membership in the
    in-memory ``SET_ENTERPRISE`` is tested.

    :param enterprise_name: candidate enterprise name
    :return: True if the name is known; False otherwise, including when the
        Redis lookup raises (the traceback is printed).
    """
    if not ENTERPRISE_HUGE:
        return enterprise_name in SET_ENTERPRISE

    log("use redis")
    if POOL_REDIS is None:
        init_redis_pool()
    _db = POOL_REDIS.getConnector()
    try:
        _start = time.time()
        _v = _db.get(enterprise_name)
        log("redis take %.5f"%(time.time()-_start))
        return _v is not None
    except Exception:
        traceback.print_exc()
        return False
    finally:
        # Always hand the connector back; it used to be returned only on the
        # success path, so a failed lookup leaked it from the pool.
        POOL_REDIS.putConnector(_db)
+
 
 import threading
 import time
@@ -212,19 +272,35 @@ def match_enterprise_max_first(sentence):
     begin_index = 0
     if len(sentence)>4:
         while True:
-            if begin_index+4<len(sentence):
-                key_enter = sentence[begin_index:begin_index+4]
-                if key_enter in DICT_ENTERPRISE:
-                    for _i in range(MAX_ENTERPRISE_LEN-4+1):
-                        enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
-                        if enter_name in DICT_ENTERPRISE[key_enter]:
-                            match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
-                            list_match.append(match_item)
-                            begin_index += (len(key_enter)+len(enter_name))-1
-                            break
+            if begin_index+ENTERPRISE_KEY_LEN<len(sentence):
+                key_enter = sentence[begin_index:begin_index+ENTERPRISE_KEY_LEN]
+
+                # if key_enter in DICT_ENTERPRISE:
+                #     _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
+                #     for _i in range(_len):
+                #         enter_name = sentence[begin_index+ENTERPRISE_KEY_LEN:begin_index+_len-_i]
+                #         if enter_name in DICT_ENTERPRISE[key_enter]:
+                #             match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
+                #             list_match.append(match_item)
+                #             begin_index += (len(key_enter)+len(enter_name))-1
+                #             break
+
+                if key_enter in SET_PREFIX_ENTERPRISE:
+                    _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
+                    for _i in range(_len):
+                        enter_name = sentence[begin_index:begin_index+_len-_i]
+                        enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
+                        if enter_tail in SET_TAIL_ENTERPRISE:
+                            if is_enterprise_exist(enter_name):
+                                match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
+                                print("match_item",key_enter,enter_name)
+                                list_match.append(match_item)
+                                begin_index += (len(key_enter)+len(enter_name))-1
+                                break
                 begin_index += 1
             else:
                 break
+    print("======",list_match)
     return list_match
 
 def calibrateEnterprise(list_articles,list_sentences,list_entitys):