|
@@ -155,11 +155,11 @@ def getEnterprisePath():
|
|
if huge_path is None:
|
|
if huge_path is None:
|
|
if os.path.exists(filename_huge):
|
|
if os.path.exists(filename_huge):
|
|
log("enterprise path:%s"%(filename_huge))
|
|
log("enterprise path:%s"%(filename_huge))
|
|
- ENTERPRISE_HUGE = False
|
|
|
|
|
|
+ ENTERPRISE_HUGE = True
|
|
return filename_huge,ENTERPRISE_HUGE
|
|
return filename_huge,ENTERPRISE_HUGE
|
|
else:
|
|
else:
|
|
log("enterprise path:%s"%(huge_path))
|
|
log("enterprise path:%s"%(huge_path))
|
|
- ENTERPRISE_HUGE = False
|
|
|
|
|
|
+ ENTERPRISE_HUGE = True
|
|
return huge_path,ENTERPRISE_HUGE
|
|
return huge_path,ENTERPRISE_HUGE
|
|
|
|
|
|
filename = "LEGAL_ENTERPRISE.txt"
|
|
filename = "LEGAL_ENTERPRISE.txt"
|
|
@@ -186,21 +186,31 @@ ENTERPRISE_TAIL_LEN = 3
|
|
SET_ENTERPRISE = set()
|
|
SET_ENTERPRISE = set()
|
|
SET_PREFIX_ENTERPRISE = set()
|
|
SET_PREFIX_ENTERPRISE = set()
|
|
SET_TAIL_ENTERPRISE = set()
|
|
SET_TAIL_ENTERPRISE = set()
|
|
|
|
+SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
|
|
|
|
+SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
|
|
def getDict_enterprise():
|
|
def getDict_enterprise():
|
|
global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
|
|
global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
|
|
real_path,is_huge = getEnterprisePath()
|
|
real_path,is_huge = getEnterprisePath()
|
|
- with open(real_path,"r",encoding="UTF8") as f:
|
|
|
|
- for _e in f:
|
|
|
|
- if not _e:
|
|
|
|
- continue
|
|
|
|
- _e = _e.strip()
|
|
|
|
- if len(_e)>=4:
|
|
|
|
- key_enter = _e[:ENTERPRISE_KEY_LEN]
|
|
|
|
- SET_PREFIX_ENTERPRISE.add(key_enter)
|
|
|
|
- SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
|
|
|
|
- if not is_huge:
|
|
|
|
- SET_ENTERPRISE.add(_e)
|
|
|
|
-
|
|
|
|
|
|
+ if is_huge:
|
|
|
|
+ if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
|
|
|
|
+ SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
|
|
|
|
+ SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
|
|
|
|
+ else:
|
|
|
|
+ with open(real_path,"r",encoding="UTF8") as f:
|
|
|
|
+ for _e in f:
|
|
|
|
+ if not _e:
|
|
|
|
+ continue
|
|
|
|
+ _e = _e.strip()
|
|
|
|
+ if len(_e)>=4:
|
|
|
|
+ key_enter = _e[:ENTERPRISE_KEY_LEN]
|
|
|
|
+ SET_PREFIX_ENTERPRISE.add(key_enter)
|
|
|
|
+ SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
|
|
|
|
+ if not is_huge:
|
|
|
|
+ SET_ENTERPRISE.add(_e)
|
|
|
|
+ #仅在大文件情况下才使用缓存加载
|
|
|
|
+ if is_huge:
|
|
|
|
+ save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
|
|
|
|
+ save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
|
|
|
|
|
|
|
|
|
|
log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
|
|
log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
|
|
@@ -292,7 +302,7 @@ def match_enterprise_max_first(sentence):
|
|
enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
|
|
enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
|
|
if enter_tail in SET_TAIL_ENTERPRISE:
|
|
if enter_tail in SET_TAIL_ENTERPRISE:
|
|
if is_enterprise_exist(enter_name):
|
|
if is_enterprise_exist(enter_name):
|
|
- match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
|
|
|
|
|
|
+ match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
|
|
print("match_item",key_enter,enter_name)
|
|
print("match_item",key_enter,enter_name)
|
|
list_match.append(match_item)
|
|
list_match.append(match_item)
|
|
begin_index += (len(key_enter)+len(enter_name))-1
|
|
begin_index += (len(key_enter)+len(enter_name))-1
|