瀏覽代碼

线上流程增加redis处理流

luojiehua 2 年之前
父節點
當前提交
74d0b0b8f6
共有 1 個文件被更改,包括 25 次插入和 15 次删除
  1. 25 15
      BiddingKG/dl/entityLink/entityLink.py

+ 25 - 15
BiddingKG/dl/entityLink/entityLink.py

@@ -155,11 +155,11 @@ def getEnterprisePath():
     if huge_path is None:
         if os.path.exists(filename_huge):
             log("enterprise path:%s"%(filename_huge))
-            ENTERPRISE_HUGE = False
+            ENTERPRISE_HUGE = True
             return filename_huge,ENTERPRISE_HUGE
     else:
         log("enterprise path:%s"%(huge_path))
-        ENTERPRISE_HUGE = False
+        ENTERPRISE_HUGE = True
         return huge_path,ENTERPRISE_HUGE
 
     filename = "LEGAL_ENTERPRISE.txt"
@@ -186,21 +186,31 @@ ENTERPRISE_TAIL_LEN = 3
 SET_ENTERPRISE = set()
 SET_PREFIX_ENTERPRISE = set()
 SET_TAIL_ENTERPRISE = set()
+SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
+SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
 def getDict_enterprise():
     global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
     real_path,is_huge = getEnterprisePath()
-    with open(real_path,"r",encoding="UTF8") as f:
-        for _e in f:
-            if not _e:
-                continue
-            _e = _e.strip()
-            if len(_e)>=4:
-                key_enter = _e[:ENTERPRISE_KEY_LEN]
-                SET_PREFIX_ENTERPRISE.add(key_enter)
-                SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
-                if not is_huge:
-                    SET_ENTERPRISE.add(_e)
-
+    if is_huge:
+        if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
+            SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
+            SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
+        else:
+            with open(real_path,"r",encoding="UTF8") as f:
+                for _e in f:
+                    if not _e:
+                        continue
+                    _e = _e.strip()
+                    if len(_e)>=4:
+                        key_enter = _e[:ENTERPRISE_KEY_LEN]
+                        SET_PREFIX_ENTERPRISE.add(key_enter)
+                        SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
+                        if not is_huge:
+                            SET_ENTERPRISE.add(_e)
+            #仅在大文件情况下才使用缓存加载
+            if is_huge:
+                save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
+                save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
 
 
     log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
@@ -292,7 +302,7 @@ def match_enterprise_max_first(sentence):
                         enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
                         if enter_tail in SET_TAIL_ENTERPRISE:
                             if is_enterprise_exist(enter_name):
-                                match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
+                                match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
                                 print("match_item",key_enter,enter_name)
                                 list_match.append(match_item)
                                 begin_index += (len(key_enter)+len(enter_name))-1