|
@@ -12,6 +12,7 @@ _time = time.time()
|
|
from BiddingKG.dl.common.Utils import *
|
|
from BiddingKG.dl.common.Utils import *
|
|
from BiddingKG.dl.interface.Entitys import *
|
|
from BiddingKG.dl.interface.Entitys import *
|
|
import json
|
|
import json
|
|
|
|
+from BiddingKG.dl.common.constDict import ConstDict
|
|
|
|
|
|
def edit_distance(source,target):
|
|
def edit_distance(source,target):
|
|
dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
|
|
dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
|
|
@@ -145,42 +146,67 @@ def get_nlp_enterprise(list_entity):
|
|
|
|
|
|
return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
|
|
return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num]
|
|
|
|
|
|
|
|
# Flag describing which enterprise dictionary was selected; written by
# getEnterprisePath() and read by is_enterprise_exist().
ENTERPRISE_HUGE = None


def getEnterprisePath():
    """Locate the enterprise-name dictionary file.

    Lookup order:
      1. "LEGAL_ENTERPRISE_HUGE.txt" as resolved by getFileFromSysPath();
      2. "LEGAL_ENTERPRISE_HUGE.txt" relative to the working directory;
      3. "LEGAL_ENTERPRISE.txt" as resolved by getFileFromSysPath(), or the
         bare file name when that lookup returns None.

    Returns:
        A ``(path, ENTERPRISE_HUGE)`` tuple. The module-level
        ENTERPRISE_HUGE flag is updated as a side effect.
    """
    global ENTERPRISE_HUGE

    huge_name = "LEGAL_ENTERPRISE_HUGE.txt"
    huge_found = getFileFromSysPath(huge_name)

    # Pick whichever location of the huge dictionary is available, if any.
    if huge_found is not None:
        selected = huge_found
    elif os.path.exists(huge_name):
        selected = huge_name
    else:
        selected = None

    if selected is not None:
        log("enterprise path:%s"%(selected))
        # NOTE(review): the flag is False even when the huge file is chosen,
        # so is_enterprise_exist() never takes its redis branch — confirm
        # this is intentional.
        ENTERPRISE_HUGE = False
        return selected,ENTERPRISE_HUGE

    # Fall back to the regular dictionary.
    normal_name = "LEGAL_ENTERPRISE.txt"
    normal_path = getFileFromSysPath(normal_name)
    if normal_path is None:
        normal_path = normal_name
    log("ENTERPRISE path:%s"%(normal_path))
    ENTERPRISE_HUGE = False
    return normal_path,ENTERPRISE_HUGE
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
|
|
# Set to True at the end of getDict_enterprise(), once the sets below are filled.
DICT_ENTERPRISE_DONE = False

# Redis connector pool; stays None until init_redis_pool() creates it.
POOL_REDIS = None

# Number of leading characters of a name stored in SET_PREFIX_ENTERPRISE.
ENTERPRISE_KEY_LEN = 3

# NOTE(review): not referenced by the code visible here — confirm it is used.
ENTERPRISE_PREFIX_LEN = 3
# Number of trailing characters of a name stored in SET_TAIL_ENTERPRISE.
ENTERPRISE_TAIL_LEN = 3

# Full enterprise names; filled by getDict_enterprise() when the dictionary
# is not flagged as huge.
SET_ENTERPRISE = set()
# Name prefixes, used as a cheap pre-filter before a full-name lookup.
SET_PREFIX_ENTERPRISE = set()
# Name suffixes, used as a cheap pre-filter before a full-name lookup.
SET_TAIL_ENTERPRISE = set()
|
|
def getDict_enterprise():
|
|
def getDict_enterprise():
|
|
- global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
|
|
|
|
- real_path = getEnterprisePath()
|
|
|
|
|
|
+ global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
|
|
|
|
+ real_path,is_huge = getEnterprisePath()
|
|
with open(real_path,"r",encoding="UTF8") as f:
|
|
with open(real_path,"r",encoding="UTF8") as f:
|
|
for _e in f:
|
|
for _e in f:
|
|
if not _e:
|
|
if not _e:
|
|
continue
|
|
continue
|
|
_e = _e.strip()
|
|
_e = _e.strip()
|
|
if len(_e)>=4:
|
|
if len(_e)>=4:
|
|
- key_enter = _e[:4]
|
|
|
|
- if key_enter not in DICT_ENTERPRISE:
|
|
|
|
- DICT_ENTERPRISE[key_enter] = set()
|
|
|
|
- DICT_ENTERPRISE[key_enter].add(_e[4:])
|
|
|
|
|
|
+ key_enter = _e[:ENTERPRISE_KEY_LEN]
|
|
|
|
+ SET_PREFIX_ENTERPRISE.add(key_enter)
|
|
|
|
+ SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
|
|
|
|
+ if not is_huge:
|
|
|
|
+ SET_ENTERPRISE.add(_e)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
|
|
|
|
+ log("SET_TAIL_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_TAIL_ENTERPRISE)/1024/1024,len(SET_TAIL_ENTERPRISE)))
|
|
|
|
+ log("SET_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_ENTERPRISE)/1024/1024,len(SET_ENTERPRISE)))
|
|
|
|
|
|
- log("dict_enterprise takes memory:%dM"%(sys.getsizeof(DICT_ENTERPRISE)/1024/1024))
|
|
|
|
# for _e in ["河南省柘源","建筑工程有限公司"]:
|
|
# for _e in ["河南省柘源","建筑工程有限公司"]:
|
|
# if not _e:
|
|
# if not _e:
|
|
# continue
|
|
# continue
|
|
@@ -191,7 +217,41 @@ def getDict_enterprise():
|
|
# DICT_ENTERPRISE[key_enter] = set()
|
|
# DICT_ENTERPRISE[key_enter] = set()
|
|
# DICT_ENTERPRISE[key_enter].add(_e[4:])
|
|
# DICT_ENTERPRISE[key_enter].add(_e[4:])
|
|
DICT_ENTERPRISE_DONE = True
|
|
DICT_ENTERPRISE_DONE = True
|
|
- return DICT_ENTERPRISE
|
|
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def init_redis_pool():
    """Create the module-level redis connector pool if it does not exist yet.

    Calling this again after the pool has been created leaves the existing
    pool untouched. The project imports are done inside the function, so
    they only run when it is called.
    """
    from BiddingKG.dl.common.pool import ConnectorPool
    from BiddingKG.dl.common.source import getConnect_redis_baseline
    global POOL_REDIS

    if POOL_REDIS is not None:
        # Pool already created by an earlier call.
        return
    POOL_REDIS = ConnectorPool(
        init_num=1,
        max_num=10,
        method_init=getConnect_redis_baseline,
    )
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def is_enterprise_exist(enterprise_name):
    """Tell whether ``enterprise_name`` is a known enterprise name.

    When the module-level ENTERPRISE_HUGE flag is truthy, the name is looked
    up as a key in redis through a connector borrowed from POOL_REDIS (the
    pool is created on first use). Otherwise the in-memory SET_ENTERPRISE
    is consulted.

    Args:
        enterprise_name: the candidate full enterprise name.

    Returns:
        True if the name is found, False otherwise. An error raised during
        the redis lookup is printed and reported as False.
    """
    if not ENTERPRISE_HUGE:
        return enterprise_name in SET_ENTERPRISE

    log("use redis")
    if POOL_REDIS is None:
        init_redis_pool()
    _db = POOL_REDIS.getConnector()
    try:
        _start = time.time()
        _v = _db.get(enterprise_name)
        log("redis take %.5f"%(time.time()-_start))
        return _v is not None
    except Exception:
        # Best effort: a failed lookup is treated as "not found".
        traceback.print_exc()
        return False
    finally:
        # Always hand the connector back; previously a failed lookup leaked
        # it, which could exhaust the pool.
        POOL_REDIS.putConnector(_db)
|
|
|
|
+
|
|
|
|
|
|
import threading
|
|
import threading
|
|
import time
|
|
import time
|
|
@@ -212,19 +272,35 @@ def match_enterprise_max_first(sentence):
|
|
begin_index = 0
|
|
begin_index = 0
|
|
if len(sentence)>4:
|
|
if len(sentence)>4:
|
|
while True:
|
|
while True:
|
|
- if begin_index+4<len(sentence):
|
|
|
|
- key_enter = sentence[begin_index:begin_index+4]
|
|
|
|
- if key_enter in DICT_ENTERPRISE:
|
|
|
|
- for _i in range(MAX_ENTERPRISE_LEN-4+1):
|
|
|
|
- enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
|
|
|
|
- if enter_name in DICT_ENTERPRISE[key_enter]:
|
|
|
|
- match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
|
|
|
|
- list_match.append(match_item)
|
|
|
|
- begin_index += (len(key_enter)+len(enter_name))-1
|
|
|
|
- break
|
|
|
|
|
|
+ if begin_index+ENTERPRISE_KEY_LEN<len(sentence):
|
|
|
|
+ key_enter = sentence[begin_index:begin_index+ENTERPRISE_KEY_LEN]
|
|
|
|
+
|
|
|
|
+ # if key_enter in DICT_ENTERPRISE:
|
|
|
|
+ # _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
|
|
|
|
+ # for _i in range(_len):
|
|
|
|
+ # enter_name = sentence[begin_index+ENTERPRISE_KEY_LEN:begin_index+_len-_i]
|
|
|
|
+ # if enter_name in DICT_ENTERPRISE[key_enter]:
|
|
|
|
+ # match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
|
|
|
|
+ # list_match.append(match_item)
|
|
|
|
+ # begin_index += (len(key_enter)+len(enter_name))-1
|
|
|
|
+ # break
|
|
|
|
+
|
|
|
|
+ if key_enter in SET_PREFIX_ENTERPRISE:
|
|
|
|
+ _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
|
|
|
|
+ for _i in range(_len):
|
|
|
|
+ enter_name = sentence[begin_index:begin_index+_len-_i]
|
|
|
|
+ enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
|
|
|
|
+ if enter_tail in SET_TAIL_ENTERPRISE:
|
|
|
|
+ if is_enterprise_exist(enter_name):
|
|
|
|
+ match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
|
|
|
|
+ print("match_item",key_enter,enter_name)
|
|
|
|
+ list_match.append(match_item)
|
|
|
|
+ begin_index += (len(key_enter)+len(enter_name))-1
|
|
|
|
+ break
|
|
begin_index += 1
|
|
begin_index += 1
|
|
else:
|
|
else:
|
|
break
|
|
break
|
|
|
|
+ print("======",list_match)
|
|
return list_match
|
|
return list_match
|
|
|
|
|
|
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|
|
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
|