|
@@ -13,6 +13,7 @@ from BaseDataMaintenance.model.ots.document_product_dict_interface import *
|
|
|
from BaseDataMaintenance.model.ots.document import *
|
|
|
from BaseDataMaintenance.model.ots.attachment import *
|
|
|
from BaseDataMaintenance.model.ots.enterprise import *
|
|
|
+from BaseDataMaintenance.model.ots.project import *
|
|
|
|
|
|
from tablestore import *
|
|
|
|
|
@@ -24,18 +25,19 @@ from BaseDataMaintenance.maintenance.product.product_dict import Product_Dict_Ma
|
|
|
from apscheduler.schedulers.blocking import BlockingScheduler
|
|
|
|
|
|
from BaseDataMaintenance.maintenance.product.make_brand_pattern import *
|
|
|
-from BaseDataMaintenance.maintenance.product.product_dict import IS_SYNCHONIZED
|
|
|
+from BaseDataMaintenance.maintenance.product.product_dict import *
|
|
|
import logging
|
|
|
|
|
|
root = logging.getLogger()
|
|
|
root.setLevel(logging.INFO)
|
|
|
from uuid import uuid4
|
|
|
+from multiprocessing import Queue as PQueue
|
|
|
|
|
|
class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
def __init__(self):
|
|
|
super(Product_Manager, self).__init__()
|
|
|
- self.process_queue = Queue()
|
|
|
+ self.process_queue = PQueue()
|
|
|
self.ots_client = getConnect_ots()
|
|
|
|
|
|
self.set_id = set()
|
|
@@ -63,11 +65,12 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
return
|
|
|
bool_query = BoolQuery(must_queries=[RangeQuery(DOCUMENT_PRODUCT_TMP_STATUS,1,51)])
|
|
|
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
|
|
|
SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
|
|
|
list_data = getRow_ots(rows)
|
|
|
_count = len(list_data)
|
|
|
+ log("producer %d/%d"%(q_size,total_count))
|
|
|
list_id = []
|
|
|
for _d in list_data:
|
|
|
_id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
|
|
@@ -76,7 +79,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
list_id.append(_id)
|
|
|
self.process_queue.put(_d)
|
|
|
while next_token:
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
|
|
|
SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
|
|
|
list_data = getRow_ots(rows)
|
|
@@ -109,11 +112,14 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
|
|
|
def comsumer_handle(self,item,result_queue):
|
|
|
- self.standardize(item)
|
|
|
+ try:
|
|
|
+ self.standardize(item)
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
|
|
|
|
|
|
|
|
|
- def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
|
|
|
+ def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]):
|
|
|
'''
|
|
|
Standardizes the product data
|
|
|
通过匹配标准参数表进行标准化,匹配是非精确匹配,校验规则是?
|
|
@@ -135,13 +141,24 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
document_product_tmp = Document_product_tmp(tmp_dict)
|
|
|
|
|
|
+ tenderee = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_TENDEREE,"")
|
|
|
+
|
|
|
name = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,"")
|
|
|
brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
|
|
|
specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
|
|
|
parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
|
|
|
|
|
|
+ name = name.replace(tenderee,"")
|
|
|
+ brand = brand.replace(tenderee,"")
|
|
|
+
|
|
|
+ original_name = name
|
|
|
+ original_brand = brand
|
|
|
+ original_specs = specs
|
|
|
+
|
|
|
list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
|
|
|
|
|
|
+ list_candidate_brand_specs = [a for a in [brand,specs,parameters,name] if a!=""]
|
|
|
+
|
|
|
if brand=="" and parameters!="":
|
|
|
brand = parameters
|
|
|
if specs=="" and parameters!="":
|
|
@@ -156,20 +173,47 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
brand_ots_id = None
|
|
|
specs_ots_id = None
|
|
|
if name is not None and name!="":
|
|
|
- name_vector = get_embedding_request(name)
|
|
|
- if name_vector is not None:
|
|
|
+ Coll,_ = self.get_collection(NAME_GRADE)
|
|
|
+
|
|
|
+ search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
|
|
|
+
|
|
|
+ for _search in search_list:
|
|
|
+ ots_id = _search.get("standard_name_id")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
+ ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
+
|
|
|
+ if check_product(name,ots_name,remove_words):
|
|
|
+ name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
|
|
|
+ original_name = name
|
|
|
+ new_name = standard_name
|
|
|
+
|
|
|
+ log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
|
|
|
+ # #update alias of name
|
|
|
+ # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
|
|
|
+ # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
+ # if _flag and _dpd.updateAlias(name):
|
|
|
+ # _dpd.update_row(self.ots_client)
|
|
|
+ break
|
|
|
+ if name_ots_id is None:
|
|
|
+ for name in list_candidates:
|
|
|
Coll,_ = self.get_collection(NAME_GRADE)
|
|
|
-
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=60)
|
|
|
+ search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
|
|
|
|
|
|
for _search in search_list:
|
|
|
ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
+
|
|
|
+ if check_product(name,ots_name,remove_words):
|
|
|
|
|
|
- if is_similar(name,ots_name) or check_product(name,ots_name):
|
|
|
- name_ots_id = ots_id
|
|
|
- new_name = ots_name
|
|
|
+ log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
|
|
|
+ name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
|
|
|
+ original_name = name
|
|
|
+ new_name = standard_name
|
|
|
|
|
|
# #update alias of name
|
|
|
# _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
|
|
@@ -177,110 +221,104 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
# if _flag and _dpd.updateAlias(name):
|
|
|
# _dpd.update_row(self.ots_client)
|
|
|
break
|
|
|
- if name_ots_id is None:
|
|
|
- for name in list_candidates:
|
|
|
- name_vector = get_embedding_request(name)
|
|
|
- if name_vector is not None:
|
|
|
- Coll,_ = self.get_collection(NAME_GRADE)
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=20)
|
|
|
-
|
|
|
- for _search in search_list:
|
|
|
- ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
- ots_parent_id = _search.get("ots_parent_id")
|
|
|
-
|
|
|
- if is_similar(name,ots_name) or check_product(name,ots_name):
|
|
|
- name_ots_id = ots_id
|
|
|
- new_name = ots_name
|
|
|
-
|
|
|
- # #update alias of name
|
|
|
- # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
|
|
|
- # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
- # if _flag and _dpd.updateAlias(name):
|
|
|
- # _dpd.update_row(self.ots_client)
|
|
|
- break
|
|
|
if name_ots_id is not None:
|
|
|
|
|
|
if brand is not None and brand!="":
|
|
|
|
|
|
s_brand = brand
|
|
|
l_brand = [brand]
|
|
|
- l_brand.append(clean_product_brand(s_brand))
|
|
|
- brand_ch = get_chinese_string(brand)
|
|
|
- l_brand.extend(brand_ch)
|
|
|
+
|
|
|
+ Coll,_ = self.get_collection(BRAND_GRADE)
|
|
|
|
|
|
_find = False
|
|
|
for brand in l_brand:
|
|
|
|
|
|
- brand_vector = get_embedding_request(brand)
|
|
|
- if brand_vector is not None:
|
|
|
- Coll,_ = self.get_collection(BRAND_GRADE)
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=60)
|
|
|
+ if len(brand)>100:
|
|
|
+ continue
|
|
|
+ search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
|
|
|
|
|
|
- # log("search brand %s"%(brand))
|
|
|
- for _search in search_list:
|
|
|
+ # log("search brand %s"%(brand))
|
|
|
+ for _search in search_list:
|
|
|
|
|
|
- ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
- ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ ots_id = _search.get("standard_name_id")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
+ ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
|
|
|
- # log("check brand %s and %s"%(brand,ots_name))
|
|
|
- if is_similar(brand,ots_name) or check_brand(brand,ots_name):
|
|
|
+ # log("check brand %s and %s"%(brand,ots_name))
|
|
|
+ if check_brand(brand,ots_name,remove_words):
|
|
|
|
|
|
- # log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
+ # log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
|
|
|
- if ots_name==new_name:
|
|
|
+ if ots_name==new_name:
|
|
|
+ continue
|
|
|
+ original_brand = brand
|
|
|
+ if original_brand==original_name:
|
|
|
+ if original_brand.find(ots_name)>=1:
|
|
|
continue
|
|
|
- new_brand = ots_name
|
|
|
-
|
|
|
- log("checking brand %s succeed %s"%(brand,new_brand))
|
|
|
- # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
|
|
|
-
|
|
|
- if name_ots_id is not None:
|
|
|
- brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
|
|
|
-
|
|
|
- _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
|
|
|
- DOCUMENT_PRODUCT_DICT_NAME:new_brand,
|
|
|
- DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
|
|
|
- DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
|
|
|
- DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
- DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
|
|
|
- DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
|
|
|
- DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- }
|
|
|
- _dpd_brand = Document_product_dict(_d_brand)
|
|
|
- # _dpd_brand.updateAlias(str(new_brand).lower())
|
|
|
- if not _dpd_brand.exists_row(self.ots_client):
|
|
|
- _dpd_brand.update_row(self.ots_client)
|
|
|
-
|
|
|
- else:
|
|
|
- pass
|
|
|
- # #update alias
|
|
|
- # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
|
|
|
- # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
- # if _flag:
|
|
|
- # if _dpd.updateAlias(brand):
|
|
|
- # _dpd.update_row(self.ots_client)
|
|
|
+ if len(original_brand)<=3:
|
|
|
+ continue
|
|
|
+ new_brand = standard_name
|
|
|
+
|
|
|
+ log("checking brand %s succeed %s"%(brand,new_brand))
|
|
|
+ # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
|
|
|
+
|
|
|
+ if name_ots_id is not None:
|
|
|
+ brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
|
|
|
+
|
|
|
+ _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_NAME:new_brand,
|
|
|
+ DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
|
|
|
+ DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
|
|
|
+ DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ }
|
|
|
+ _dpd_brand = Document_product_dict(_d_brand)
|
|
|
+ # _dpd_brand.updateAlias(str(new_brand).lower())
|
|
|
+ if not _dpd_brand.exists_row(self.ots_client):
|
|
|
+ _dpd_brand.update_row(self.ots_client)
|
|
|
|
|
|
- _find = True
|
|
|
- break
|
|
|
- else:
|
|
|
- # log("check brand similar failed:%s and %s"%(brand,ots_name))
|
|
|
- # add new brand?
|
|
|
- pass
|
|
|
- if _find:
|
|
|
+ else:
|
|
|
+ pass
|
|
|
+ # #update alias
|
|
|
+ # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
|
|
|
+ # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
+ # if _flag:
|
|
|
+ # if _dpd.updateAlias(brand):
|
|
|
+ # _dpd.update_row(self.ots_client)
|
|
|
+
|
|
|
+ _find = True
|
|
|
break
|
|
|
+ else:
|
|
|
+ # log("check brand similar failed:%s and %s"%(brand,ots_name))
|
|
|
+ # add new brand?
|
|
|
+ pass
|
|
|
+ if _find:
|
|
|
+ break
|
|
|
if not _find:
|
|
|
for brand in l_brand:
|
|
|
+ if len(brand)>100:
|
|
|
+ continue
|
|
|
if self.check_new_brand(brand):
|
|
|
new_brand = clean_product_brand(brand)
|
|
|
if new_brand=="":
|
|
|
continue
|
|
|
+ original_brand = brand
|
|
|
+ if original_brand==original_name:
|
|
|
+ if new_name==original_brand:
|
|
|
+ continue
|
|
|
+ if original_brand.find(new_brand)>=1:
|
|
|
+ continue
|
|
|
+ if len(original_brand)<=3:
|
|
|
+ continue
|
|
|
log("adding new brand %s"%(str(new_brand)))
|
|
|
_d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(new_brand).lower()),
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
|
|
@@ -294,74 +332,74 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
if brand_ots_id is None:
|
|
|
_find = False
|
|
|
+ Coll,_ = self.get_collection(BRAND_GRADE)
|
|
|
for brand in list_candidates:
|
|
|
if _find:
|
|
|
break
|
|
|
l_brand = [brand]
|
|
|
- l_brand.append(clean_product_brand(brand))
|
|
|
- brand_ch = get_chinese_string(brand)
|
|
|
- l_brand.extend(brand_ch)
|
|
|
|
|
|
for brand in l_brand:
|
|
|
+ if len(brand)>100:
|
|
|
+ continue
|
|
|
if _find:
|
|
|
break
|
|
|
- start_time = time.time()
|
|
|
- # brand_vector = request_embedding(brand)
|
|
|
- brand_vector = get_embedding_request(brand)
|
|
|
- debug("get embedding for brand %s takes %.4fs"%(brand,time.time()-start_time))
|
|
|
- if brand_vector is not None:
|
|
|
- Coll,_ = self.get_collection(BRAND_GRADE)
|
|
|
- start_time = time.time()
|
|
|
- # search_list = search_embedding(Coll,embedding_index_name,[brand_vector],self.search_params,output_fields,limit=10)
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=10)
|
|
|
- debug("get search_list for brand %s takes %.4fs"%(brand,time.time()-start_time))
|
|
|
- # log("search brand %s"%(brand))
|
|
|
- for _search in search_list:
|
|
|
-
|
|
|
-
|
|
|
- ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
- ots_parent_id = _search.get("ots_parent_id")
|
|
|
-
|
|
|
- # log("check brand %s and %s"%(brand,ots_name))
|
|
|
- if is_similar(brand,ots_name,_radio=95) or check_brand(brand,ots_name):
|
|
|
- # log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
- if ots_name==new_name:
|
|
|
+
|
|
|
+ search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
|
|
|
+ # log("search brand %s"%(brand))
|
|
|
+ for _search in search_list:
|
|
|
+
|
|
|
+ ots_id = _search.get("standard_name_id")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
+ ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
+
|
|
|
+ # log("check brand %s and %s"%(brand,ots_name))
|
|
|
+ if check_brand(brand,ots_name,remove_words):
|
|
|
+ # log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
+ if ots_name==new_name:
|
|
|
+ continue
|
|
|
+
|
|
|
+ original_brand = brand
|
|
|
+ if original_brand==original_name:
|
|
|
+ if original_brand.find(ots_name)>=1:
|
|
|
continue
|
|
|
- new_brand = ots_name
|
|
|
+ if len(original_brand)<=3:
|
|
|
+ continue
|
|
|
+ new_brand = standard_name
|
|
|
|
|
|
- log("checking brand %s succeed %s"%(brand,new_brand))
|
|
|
- # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
|
|
|
+ log("checking brand %s succeed %s"%(brand,new_brand))
|
|
|
+ # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
|
|
|
|
|
|
- if name_ots_id is not None:
|
|
|
- brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
|
|
|
+ if name_ots_id is not None:
|
|
|
+ brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
|
|
|
|
|
|
- _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
|
|
|
- DOCUMENT_PRODUCT_DICT_NAME:new_brand,
|
|
|
- DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
|
|
|
- DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
|
|
|
- DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
- DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
|
|
|
- DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
|
|
|
- DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- }
|
|
|
- _dpd_brand = Document_product_dict(_d_brand)
|
|
|
- # _dpd_brand.updateAlias(str(new_brand).lower())
|
|
|
- if not _dpd_brand.exists_row(self.ots_client):
|
|
|
- _dpd_brand.update_row(self.ots_client)
|
|
|
+ _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_NAME:new_brand,
|
|
|
+ DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
|
|
|
+ DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
|
|
|
+ DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ }
|
|
|
+ _dpd_brand = Document_product_dict(_d_brand)
|
|
|
+ # _dpd_brand.updateAlias(str(new_brand).lower())
|
|
|
+ if not _dpd_brand.exists_row(self.ots_client):
|
|
|
+ _dpd_brand.update_row(self.ots_client)
|
|
|
|
|
|
- else:
|
|
|
- pass
|
|
|
- # #update alias
|
|
|
- # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
|
|
|
- # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
- # if _flag:
|
|
|
- # if _dpd.updateAlias(brand):
|
|
|
- # _dpd.update_row(self.ots_client)
|
|
|
+ else:
|
|
|
+ pass
|
|
|
+ # #update alias
|
|
|
+ # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
|
|
|
+ # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
|
+ # if _flag:
|
|
|
+ # if _dpd.updateAlias(brand):
|
|
|
+ # _dpd.update_row(self.ots_client)
|
|
|
|
|
|
- _find = True
|
|
|
- break
|
|
|
+ _find = True
|
|
|
+ break
|
|
|
|
|
|
if specs is not None and specs!="":
|
|
|
|
|
@@ -374,9 +412,9 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
for s in re.split("[\u4e00-\u9fff]",specs):
|
|
|
if s!="" and len(s)>4:
|
|
|
list_specs.append(s)
|
|
|
- similar_flag = None
|
|
|
_index = 0
|
|
|
break_flag = False
|
|
|
+ list_similar_specs = []
|
|
|
for c_specs in list_specs:
|
|
|
if break_flag:
|
|
|
break
|
|
@@ -385,12 +423,13 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
if specs_vector is not None:
|
|
|
Coll,_ = self.get_collection(SPECS_GRADE)
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=60)
|
|
|
+ search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=20)
|
|
|
|
|
|
for _search in search_list:
|
|
|
|
|
|
ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
|
|
|
debug("checking specs %s and %s"%(specs,ots_name))
|
|
@@ -398,7 +437,10 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
# log("specs is_similar")
|
|
|
if check_specs(c_specs,ots_name):
|
|
|
break_flag = True
|
|
|
- new_specs = ots_name
|
|
|
+ original_specs = c_specs
|
|
|
+ if standard_name==new_name:
|
|
|
+ continue
|
|
|
+ new_specs = standard_name
|
|
|
log("check_specs %s succeed %s"%(specs,new_specs))
|
|
|
|
|
|
# to update the document_product_dict which is builded for search
|
|
@@ -408,7 +450,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
_d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
|
|
|
DOCUMENT_PRODUCT_DICT_NAME:new_specs,
|
|
|
- DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
|
|
|
DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
|
|
|
DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
|
|
@@ -429,51 +471,60 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
# if _flag:
|
|
|
# if _dpd.updateAlias(specs):
|
|
|
# _dpd.update_row(self.ots_client)
|
|
|
+ break_flag = True
|
|
|
break
|
|
|
- else:
|
|
|
- if _index == 1:
|
|
|
- similar_flag = True
|
|
|
-
|
|
|
+ else:
|
|
|
+ list_similar_specs.append(specs)
|
|
|
# add new specs?
|
|
|
- debug("specs not similar")
|
|
|
- if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
|
|
|
- debug("is_legal_specs")
|
|
|
- new_specs = clean_product_specs(specs)
|
|
|
- # insert into document_product_dict a new record
|
|
|
- # to update the document_product_dict which is builded for search
|
|
|
- # add new specs
|
|
|
- if brand_ots_id is not None and name_ots_id is not None:
|
|
|
- _md5 = get_document_product_dict_id(brand_ots_id,new_specs)
|
|
|
-
|
|
|
- # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
|
|
|
- # DOCUMENT_PRODUCT_DICT_NAME:new_specs,
|
|
|
- # DOCUMENT_PRODUCT_DICT_ALIAS:"%s&&%s"%(specs,new_specs),
|
|
|
- # DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
|
|
|
- # DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
- # DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
|
|
|
- # DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- # DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- # }
|
|
|
- # _dpd = Document_product_dict(_d)
|
|
|
- # _dpd.update_row(self.ots_client)
|
|
|
-
|
|
|
- log("adding new specs %s"%(new_specs))
|
|
|
- # user interface to add
|
|
|
- _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_specs,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(new_specs.lower()),
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:SPECS_GRADE,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:brand_ots_id,
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
- DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
|
|
|
- }
|
|
|
- _dpdi = Document_product_dict_interface(_d)
|
|
|
- _dpdi.update_row(self.ots_client)
|
|
|
+ if new_specs is not None and new_specs!="":
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ debug("specs not similar")
|
|
|
+ for specs in list_similar_specs:
|
|
|
+ if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
|
|
|
+ debug("is_legal_specs")
|
|
|
+ original_specs = specs
|
|
|
+
|
|
|
+ new_specs = clean_product_specs(specs)
|
|
|
+ if new_specs==new_name:
|
|
|
+ new_specs = ""
|
|
|
+ continue
|
|
|
+ # insert into document_product_dict a new record
|
|
|
+ # to update the document_product_dict which is builded for search
|
|
|
+ # add new specs
|
|
|
+ if brand_ots_id is not None and name_ots_id is not None:
|
|
|
+ specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
|
|
|
+
|
|
|
+ # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_NAME:new_specs,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_ALIAS:"%s&&%s"%(specs,new_specs),
|
|
|
+ # DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ # DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ # }
|
|
|
+ # _dpd = Document_product_dict(_d)
|
|
|
+ # _dpd.update_row(self.ots_client)
|
|
|
+
|
|
|
+ log("adding new specs %s"%(new_specs))
|
|
|
+ # user interface to add
|
|
|
+ _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_specs,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(new_specs.lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:SPECS_GRADE,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:brand_ots_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
|
|
|
+ }
|
|
|
+ _dpdi = Document_product_dict_interface(_d)
|
|
|
+ _dpdi.update_row(self.ots_client)
|
|
|
+ break
|
|
|
if specs_ots_id is None:
|
|
|
_find = False
|
|
|
- for specs in list_candidates:
|
|
|
+ for specs in list_candidate_brand_specs:
|
|
|
if _find:
|
|
|
break
|
|
|
|
|
@@ -495,29 +546,34 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
if specs_vector is not None:
|
|
|
Coll,_ = self.get_collection(SPECS_GRADE)
|
|
|
- search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=20)
|
|
|
+ search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=10)
|
|
|
|
|
|
for _search in search_list:
|
|
|
if _find:
|
|
|
break
|
|
|
|
|
|
ots_id = _search.get("standard_name_id")
|
|
|
- ots_name = _search.get("standard_name")
|
|
|
+ ots_name = _search.get("ots_name")
|
|
|
+ standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
|
|
|
debug("checking specs %s and %s"%(specs,ots_name))
|
|
|
- if is_similar(specs,ots_name):
|
|
|
+ if is_similar(c_specs,ots_name):
|
|
|
# log("specs is_similar")
|
|
|
if check_specs(c_specs,ots_name):
|
|
|
break_flag = True
|
|
|
- new_specs = ots_name
|
|
|
+ original_specs = c_specs
|
|
|
+ new_specs = standard_name
|
|
|
+ if new_specs==new_name:
|
|
|
+ new_specs = ""
|
|
|
+ continue
|
|
|
if brand_ots_id is not None:
|
|
|
# judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
|
|
|
specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
|
|
|
|
|
|
_d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
|
|
|
DOCUMENT_PRODUCT_DICT_NAME:new_specs,
|
|
|
- DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
|
|
|
+ DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
|
|
|
DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
|
|
|
DOCUMENT_PRODUCT_DICT_STATUS:1,
|
|
|
DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
|
|
@@ -557,11 +613,12 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
if unit_price>0:
|
|
|
new_quantity = total_price/unit_price
|
|
|
if new_quantity!=quantity:
|
|
|
- if new_quantity==total_price//unit_price:
|
|
|
- quantity = int(new_quantity)
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
|
|
|
- else:
|
|
|
- is_legal_data = False
|
|
|
+ # if new_quantity==total_price//unit_price:
|
|
|
+ # quantity = int(new_quantity)
|
|
|
+ # _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
|
|
|
+ # else:
|
|
|
+ # is_legal_data = False
|
|
|
+ is_legal_data = False
|
|
|
elif quantity>0:
|
|
|
unit_price = total_price/quantity
|
|
|
_product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
|
|
@@ -610,9 +667,9 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
_product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
|
|
|
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,original_name,True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,original_brand,True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,original_specs,True)
|
|
|
|
|
|
bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
|
|
|
if bid_filemd5s is not None:
|
|
@@ -621,14 +678,16 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
if not is_legal_data:
|
|
|
_status = randint(501,550)
|
|
|
|
|
|
- elif self.dumplicate(_product):
|
|
|
- _status = randint(201,300)
|
|
|
- save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
|
|
|
-
|
|
|
- _product.update_row(self.ots_client)
|
|
|
-
|
|
|
else:
|
|
|
- _status = randint(451,500)
|
|
|
+ _flag,dump_id = self.dumplicate(_product)
|
|
|
+ if _flag:
|
|
|
+ _status = randint(201,300)
|
|
|
+ save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
|
|
|
+
|
|
|
+ _product.update_row(self.ots_client)
|
|
|
+ else:
|
|
|
+ _status = randint(451,500)
|
|
|
+ save_product_tmp.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
|
|
|
|
|
|
else:
|
|
|
_status = randint(401,450)
|
|
@@ -692,11 +751,11 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
def get_value_count(self,name,brand,specs,unit_price,quantity):
|
|
|
|
|
|
value_count = 0
|
|
|
- if len(name)>0:
|
|
|
+ if name is not None and len(name)>0:
|
|
|
value_count += 1
|
|
|
- if len(brand)>0:
|
|
|
+ if brand is not None and len(brand)>0:
|
|
|
value_count += 1
|
|
|
- if len(specs)>0:
|
|
|
+ if specs is not None and len(specs)>0:
|
|
|
value_count += 1
|
|
|
if isinstance(unit_price,(float,int)) and unit_price>0:
|
|
|
value_count += 1
|
|
@@ -716,7 +775,8 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
tenderee = str(document_product.getProperties().get(DOCUMENT_PRODUCT_TENDEREE,""))
|
|
|
supplier = str(document_product.getProperties().get(DOCUMENT_PRODUCT_SUPPLIER,""))
|
|
|
|
|
|
-
|
|
|
+ base_value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
|
|
|
+ list_dump_id = []
|
|
|
page_time_before = page_time
|
|
|
page_time_after = page_time
|
|
|
try:
|
|
@@ -725,6 +785,8 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
except Exception as e:
|
|
|
pass
|
|
|
|
|
|
+ to_save = 1
|
|
|
+
|
|
|
if len(name)>0 and len(brand)>0 and len(specs)>0 and isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
|
|
|
bool_query = BoolQuery(must_queries=[TermQuery("name",name),
|
|
|
RangeQuery("page_time",page_time_before,page_time_after,True,True),
|
|
@@ -734,12 +796,69 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
TermQuery("quantity",quantity)
|
|
|
])
|
|
|
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(bool_query,limit=1),
|
|
|
columns_to_get=ColumnsToGet(["name",'brand','specs'],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
if len(list_data)>0:
|
|
|
- return list_data[0].get(DOCUMENT_PRODUCT_ID),1
|
|
|
+ return list_data[0].get(DOCUMENT_PRODUCT_ID),0
|
|
|
+
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ TermQuery(project_docids,str(docid)),
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
|
|
|
+ SearchQuery(bool_query,limit=10),
|
|
|
+ ColumnsToGet([project_docids],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ set_docid = set()
|
|
|
+ for _data in list_data:
|
|
|
+ _docids = _data.get(project_docids,"")
|
|
|
+ for d_id in _docids.split(","):
|
|
|
+ d_id = d_id.strip()
|
|
|
+ if d_id!="":
|
|
|
+ set_docid.add(int(d_id))
|
|
|
+ if docid in set_docid:
|
|
|
+ set_docid.remove(docid)
|
|
|
+ should_q = [TermQuery(DOCUMENT_PRODUCT_DOCID,did) for did in set_docid]
|
|
|
+ if len(should_q)>0:
|
|
|
+ bool_query = BoolQuery(must_queries=[TermQuery("name",name),
|
|
|
+ BoolQuery(should_queries=should_q),
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
+ SearchQuery(bool_query,limit=50),
|
|
|
+ columns_to_get=ColumnsToGet(["docid",'name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ dict_docid_name = {}
|
|
|
+ match_ids = []
|
|
|
+ for _data in list_data:
|
|
|
+ docid1 = _data.get(DOCUMENT_PRODUCT_DOCID)
|
|
|
+ name1 = _data.get(DOCUMENT_PRODUCT_NAME)
|
|
|
+ brand1 = _data.get(DOCUMENT_PRODUCT_BRAND)
|
|
|
+ specs1 = _data.get(DOCUMENT_PRODUCT_SPECS)
|
|
|
+ unit_price1 = _data.get(DOCUMENT_PRODUCT_UNIT_PRICE)
|
|
|
+ quantity1 = _data.get(DOCUMENT_PRODUCT_QUANTITY)
|
|
|
+ id = _data.get(DOCUMENT_PRODUCT_ID)
|
|
|
+ value_count1 = self.get_value_count(name1,brand1,specs1,unit_price1,quantity1)
|
|
|
+ if name1==name:
|
|
|
+ match_ids.append({DOCUMENT_PRODUCT_ID:id,"value_count":value_count1})
|
|
|
+ if docid1 not in dict_docid_name:
|
|
|
+ dict_docid_name[docid1] = []
|
|
|
+ dict_docid_name[docid1].append(name)
|
|
|
+ is_all_one = True
|
|
|
+ for k,v in dict_docid_name.items():
|
|
|
+ if len(v)!=1:
|
|
|
+ is_all_one = False
|
|
|
+ if is_all_one:
|
|
|
+ match_ids.sort(key=lambda x:x.get("value_count",0),reverse=True)
|
|
|
+ if len(match_ids)>0:
|
|
|
+ _id = match_ids[0].get(DOCUMENT_PRODUCT_ID)
|
|
|
+ value_count1 = match_ids[0]["value_count"]
|
|
|
+ if base_value_count<value_count1:
|
|
|
+ to_save = 0
|
|
|
+ for _match in match_ids:
|
|
|
+ list_dump_id.append(_match.get(DOCUMENT_PRODUCT_ID))
|
|
|
+
|
|
|
|
|
|
if len(name)>0 and len(brand)>0 and len(supplier)>0 and len(tenderee)>0:
|
|
|
# log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
|
|
@@ -750,11 +869,11 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
TermQuery(DOCUMENT_PRODUCT_SUPPLIER,supplier),
|
|
|
])
|
|
|
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(bool_query,limit=50),
|
|
|
columns_to_get=ColumnsToGet(['name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
- value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
|
|
|
+
|
|
|
|
|
|
for _d in list_data:
|
|
|
s_id = _d.get(DOCUMENT_PRODUCT_ID)
|
|
@@ -773,12 +892,10 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
check_flag = False
|
|
|
|
|
|
if check_flag:
|
|
|
- if value_count<value_count1:
|
|
|
+ if base_value_count<value_count1:
|
|
|
to_save = 0
|
|
|
- else:
|
|
|
- to_save = 1
|
|
|
- return s_id,to_save
|
|
|
- return None,1
|
|
|
+ list_dump_id.append(s_id)
|
|
|
+ return list_dump_id,to_save
|
|
|
|
|
|
|
|
|
def dumplicate(self,document_product):
|
|
@@ -791,18 +908,27 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
dump_id,to_save = self.dumplicate_search_product(document_product)
|
|
|
|
|
|
if dump_id is not None:
|
|
|
- document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,dump_id,True)
|
|
|
+ document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
|
|
|
|
|
|
if to_save==1:
|
|
|
if dump_id is not None:
|
|
|
- _d = {DOCUMENT_PRODUCT_ID:dump_id,
|
|
|
- DOCUMENT_PRODUCT_STATUS:randint(401,450),
|
|
|
- DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
|
|
|
- _dp = Document_product(_d)
|
|
|
- _dp.update_row(self.ots_client)
|
|
|
- return True
|
|
|
+ if isinstance(dump_id,str):
|
|
|
+ _d = {DOCUMENT_PRODUCT_ID:dump_id,
|
|
|
+ DOCUMENT_PRODUCT_STATUS:randint(401,450),
|
|
|
+ DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
|
|
|
+ _dp = Document_product(_d)
|
|
|
+ _dp.update_row(self.ots_client)
|
|
|
+ elif isinstance(dump_id,list):
|
|
|
+ for d_id in dump_id:
|
|
|
+ _d = {DOCUMENT_PRODUCT_ID:d_id,
|
|
|
+ DOCUMENT_PRODUCT_STATUS:randint(401,450),
|
|
|
+ DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
|
|
|
+ _dp = Document_product(_d)
|
|
|
+ _dp.update_row(self.ots_client)
|
|
|
+
|
|
|
+ return True,dump_id
|
|
|
else:
|
|
|
- return False
|
|
|
+ return False,dump_id
|
|
|
|
|
|
def start_processing(self):
|
|
|
scheduler = BlockingScheduler()
|
|
@@ -848,11 +974,23 @@ def fix_product_data():
|
|
|
'''
|
|
|
table_name = "document_product_temp"
|
|
|
table_index = "document_product_temp_index"
|
|
|
- columns = [DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE]
|
|
|
+ columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
|
|
|
+
|
|
|
+
|
|
|
+ table_name = Document_product_table_name
|
|
|
+ table_index = Document_product_table_name+"_index"
|
|
|
+ columns = [DOCUMENT_PRODUCT_ORIGINAL_ID]
|
|
|
+
|
|
|
+
|
|
|
ots_client = getConnect_ots()
|
|
|
- bool_query = BoolQuery(must_queries=[
|
|
|
- RangeQuery("status",501),
|
|
|
+ bool_query = BoolQuery(should_queries=[
|
|
|
+ # RangeQuery("status",501),
|
|
|
# TermQuery("docid",246032980)
|
|
|
+
|
|
|
+ RangeQuery("status",401,501),
|
|
|
+ # RangeQuery("status",401,451)
|
|
|
+ # WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
|
|
|
+ # TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
|
|
|
])
|
|
|
|
|
|
rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
@@ -860,6 +998,7 @@ def fix_product_data():
|
|
|
columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
|
|
|
list_rows = getRow_ots(rows)
|
|
|
+ print(total_count)
|
|
|
while next_token:
|
|
|
rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
@@ -876,6 +1015,8 @@ def fix_product_data():
|
|
|
def fix_missing_data(item,result_queue):
|
|
|
|
|
|
original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
|
|
|
+
|
|
|
+ print("original_id",original_id)
|
|
|
_d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
|
|
|
dpt = Document_product_tmp(_d)
|
|
|
dpt.fix_columns(ots_client,["name","brand","specs"],True)
|
|
@@ -905,15 +1046,23 @@ def fix_product_data():
|
|
|
|
|
|
def deleteAndReprocess(item,result_queue):
|
|
|
|
|
|
+ original_id = item.get(DOCUMENT_PRODUCT_TMP_ID)
|
|
|
+ new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
|
|
|
+
|
|
|
original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
|
|
|
+ new_id = item.get(DOCUMENT_PRODUCT_ID)
|
|
|
+
|
|
|
+ print("original_id",original_id,"id",item.get(DOCUMENT_PRODUCT_ID))
|
|
|
# delete data and rerun
|
|
|
_d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
|
|
|
dpt = Document_product_tmp(_d)
|
|
|
dpt.update_row(ots_client)
|
|
|
|
|
|
- _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
|
|
|
- dp = Document_product(_d)
|
|
|
- dp.delete_row(ots_client)
|
|
|
+
|
|
|
+ if new_id is not None and new_id!="":
|
|
|
+ _d = {DOCUMENT_PRODUCT_ID:new_id}
|
|
|
+ dp = Document_product(_d)
|
|
|
+ dp.delete_row(ots_client)
|
|
|
|
|
|
def handle(item,result_queue):
|
|
|
win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
|
|
@@ -922,8 +1071,7 @@ def fix_product_data():
|
|
|
dpt.setValue(DOCUMENT_PRODUCT_TMP_STATUS,1,True)
|
|
|
dpt.update_row(ots_client)
|
|
|
|
|
|
-
|
|
|
- mt = MultiThreadHandler(task_queue,handle,None,30,1)
|
|
|
+ mt = MultiThreadHandler(task_queue,deleteAndReprocess,None,30,1)
|
|
|
mt.run()
|
|
|
|
|
|
def test_check_brand():
|
|
@@ -970,37 +1118,373 @@ def test_check_brand():
|
|
|
else:
|
|
|
brand = _d.get("brand")
|
|
|
list_illegal_brand.append(brand)
|
|
|
- with open("legal_brand.txt","w",encoding="utf8") as f:
|
|
|
+ with open("../../test/legal_brand.txt", "w", encoding="utf8") as f:
|
|
|
for b in list_legal_brand:
|
|
|
f.write(b+"\n")
|
|
|
- with open("illegal_brand.txt","w",encoding="utf8") as f:
|
|
|
+ with open("../../test/illegal_brand.txt", "w", encoding="utf8") as f:
|
|
|
for b in list_illegal_brand:
|
|
|
f.write(b+"\n")
|
|
|
|
|
|
def test_match():
    """Ad-hoc debug helper: look one brand string up in the milvus BRAND
    collection and print, for every candidate returned, whether it passes
    check_brand(). Intended to be run manually via test(); prints only."""
    a = "桂林市啄木鸟医疗器械有限公司"

    # vector = request_embedding(get_milvus_standard_name(a))
    # vector = [get_embedding_request(b) for b in a]
    pm = Product_Manager()
    _GRADE = BRAND_GRADE
    Coll,_ = pm.get_collection(_GRADE)
    print(Coll.name)

    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
    # start_time = time.time()
    # direct primary-key lookup of the probe string in milvus
    _id = get_milvus_product_dict_id(a)
    print(Coll.query(expr=" ots_id in ['%s'] "%(_id),output_fields=output_fields))
    # print("cost",time.time()-start_time)
    # print(Coll.compact())
    # result = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=20)
    #
    # final_list = []
    # for _search in result:
    #     _d = {}
    #     for k in output_fields:
    #         _d[k] = _search.entity.get(k)
    #     final_list.append(_d)
    # final_list = remove_repeat_item(final_list,k="ots_name")

    start_time = time.time()
    # final_list = get_embedding_search(Coll,embedding_index_name,a,_GRADE,vector,pm.search_params,output_fields,limit=5)
    final_list = get_intellect_search(Coll,embedding_index_name,a,_GRADE,pm.search_params,output_fields,limit=10)
    for _search in final_list:
        ots_id = _search.get("standard_name_id")
        ots_name = _search.get("ots_name")
        standard_name = _search.get("standard_name")
        ots_parent_id = _search.get("ots_parent_id")
        remove_words = _search.get("remove_words")
        if check_brand(a,ots_name,remove_words):
            print("similar",a,ots_name)
        else:
            print("not similar",a,ots_name)

    print("cost",time.time()-start_time)
    print(final_list)
+
|
|
|
def rebuild_milvus():
    """Rebuild the milvus product-dict collections from the OTS dictionary table.

    Pages through document_product_dict (grade >= 3), de-duplicates entries by
    (name, grade), then fans the inserts out over 5 processes x 5 threads that
    share one multiprocessing queue. Side effects only; returns nothing.
    """
    pdm = Product_Dict_Manager()
    from multiprocessing import Queue as PQueue
    bool_query = BoolQuery(must_queries=[
        RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,3)
    ])
    ots_client = getConnect_ots()
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("name")]),limit=100,get_total_count=True),
                                                                   ColumnsToGet([DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS],return_type=ColumnReturnType.SPECIFIED))

    list_data = getRow_ots(rows)
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet([DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS],return_type=ColumnReturnType.SPECIFIED))

        list_data.extend(getRow_ots(rows))
        print("%d/%d"%(len(list_data),total_count))

        # if len(list_data)>1000:
        #     break

    # keep only the first record seen for each (name, grade); a multiprocessing
    # queue is required because the workers below run in separate processes
    set_name_grade = set()
    task_queue = PQueue()
    for _data in list_data:
        name = _data.get(DOCUMENT_PRODUCT_DICT_NAME)
        grade = _data.get(DOCUMENT_PRODUCT_DICT_GRADE)
        _key = "%s--%d"%(name,grade)
        if _key not in set_name_grade:
            task_queue.put(_data)
            set_name_grade.add(_key)

    log("rebuild milvus %d counts"%(task_queue.qsize()))
    def insert_into_milvus(item,result_queue):
        # insert one dictionary record into the milvus collection of its grade
        name = item.get(DOCUMENT_PRODUCT_DICT_NAME,"")
        grade = item.get(DOCUMENT_PRODUCT_DICT_GRADE)

        if grade==SPECS_GRADE:
            name = clean_product_specs(name)
        # names shorter than 2 chars are skipped as noise; the original code
        # duplicated this check inside the SPECS_GRADE branch, but the single
        # unconditional check below covers both paths
        if len(name)<2:
            return

        parent_id = item.get(DOCUMENT_PRODUCT_DICT_PARENT_ID,"")

        Coll,_ = pdm.get_collection(grade)
        standard_alias = item.get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,"")

        log("insert name %s grade %d"%(name,grade))
        remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
        level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL)
        if level is None:
            # default level: 2 for generic device-like names, else 1
            # (assumes the same convention as update_document_product_dict -- TODO confirm)
            if re.search("装置|设备",name) is not None:
                level = 2
            else:
                level = 1
        insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level)

    def start_thread():
        # each process runs its own 5-thread handler over the shared queue
        mt = MultiThreadHandler(task_queue,insert_into_milvus,None,5)
        mt.run()
    p_count = 5
    list_p = []
    for i in range(p_count):
        p = Process(target=start_thread)
        list_p.append(p)
    for p in list_p:
        p.start()
    for p in list_p:
        p.join()
+
|
|
|
def move_document_product():
    """Migrate every row that has a product name out of the legacy
    "document_product" table: rewrite it through the model's default table,
    then delete the legacy copy. Side effects only."""
    bool_query = BoolQuery(must_queries=[
        ExistsQuery(DOCUMENT_PRODUCT_NAME)
    ])
    ots_client = getConnect_ots()
    # NOTE(review): this local deliberately shadows the imported
    # Document_product_table_name, so the searches below read the legacy table
    # while Document_product's own default table receives the writes --
    # confirm the class default points at the new table before running
    Document_product_table_name = "document_product"
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("name")]),limit=100,get_total_count=True),
                                                                   ColumnsToGet(return_type=ColumnReturnType.ALL))
    list_data = getRow_ots(rows)
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet(return_type=ColumnReturnType.ALL))
        list_data.extend(getRow_ots(rows))
        print("%d/%d"%(len(list_data),total_count))
        # if len(list_data)>=1000:
        #     break

    task_queue = Queue()

    for _data in list_data:
        task_queue.put(_data)

    def _handle(item,result_queue):

        # write the row through the model's default (new) table first ...
        D1 = Document_product(item)
        D1.update_row(ots_client)

        # ... then point the instance at the legacy table and delete the old row
        D1.table_name = Document_product_table_name
        D1.delete_row(ots_client)

    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
+
|
|
|
# absolute directory of this module; used below to locate the illegal_*.txt word lists
current_path = os.path.dirname(__file__)
|
|
|
def delete_brands():
    """Delete every brand listed in illegal_brand.txt from both the OTS
    dictionary table and the milvus BRAND collection, 30 threads wide.
    Side effects only."""
    filename = os.path.join(current_path,"illegal_brand.txt")

    ots_client = getConnect_ots()
    list_brand = []
    # one brand per line
    with open(filename,"r",encoding="utf8") as f:
        while 1:
            brand = f.readline()
            if not brand:
                break
            brand = brand.strip()
            list_brand.append(brand)

    pm = Product_Manager()
    Coll,_ = pm.get_collection(BRAND_GRADE)

    print(Coll.name)
    Coll.compact()
    _count = 0

    task_queue = Queue()
    for brand in list_brand:
        _count += 1
        task_queue.put(brand)
        # if _count>=2:
        #     break

    def _handle(brand,result_queue):

        # page through every OTS dictionary row matching (BRAND_GRADE, brand)
        bool_query = BoolQuery(must_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,BRAND_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,brand)
        ])

        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_data = getRow_ots(rows)
        _id = get_milvus_product_dict_id(brand)

        while next_token:
            rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                           ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_data.extend(getRow_ots(rows))
        # delete the OTS rows first, then the matching milvus entry by its md5 id
        for _d in list_data:
            dpd = Document_product_dict(_d)
            dpd.delete_row(ots_client)
        # print(Coll.query(expr=" ots_id in ['%s']"%(_id),output_fields=["ots_id","ots_name"]))
        delete_counts = Coll.delete(expr=" ots_id in ['%s']"%(_id)).delete_count

        log("brand %s total_count %d md5:%s delete_counts:%d"%(brand,total_count,_id,delete_counts))

    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
+
|
|
|
+
|
|
|
def delete_specs():
    """Delete every specs value listed in illegal_specs.txt from both the OTS
    dictionary table and the milvus SPECS collection, 30 threads wide, then
    compact the collection. Side effects only."""
    filename = os.path.join(current_path,"illegal_specs.txt")

    ots_client = getConnect_ots()
    # one specs value per line (locals renamed from list_brand/brand -- this
    # function was copy-pasted from delete_brands)
    list_specs = []
    with open(filename,"r",encoding="utf8") as f:
        while 1:
            line = f.readline()
            if not line:
                break
            line = line.strip()
            list_specs.append(line)

    pm = Product_Manager()
    Coll,_ = pm.get_collection(SPECS_GRADE)
    print(Coll.name)
    Coll.compact()

    _count = 0
    task_queue = Queue()

    for specs in list_specs:
        task_queue.put(specs)
        _count += 1
        # if _count>=2:
        #     break

    def _handle(specs,result_queue):

        # page through every OTS dictionary row matching (SPECS_GRADE, specs)
        bool_query = BoolQuery(must_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,specs)
        ])

        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_data = getRow_ots(rows)
        _id = get_milvus_product_dict_id(specs)

        while next_token:
            rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                           ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_data.extend(getRow_ots(rows))
        # delete the OTS rows first, then the matching milvus entry by its md5 id
        for _d in list_data:
            dpd = Document_product_dict(_d)
            dpd.delete_row(ots_client)
        # print(Coll.query(expr=" ots_id in ['%s']"%(_id),output_fields=["ots_id","ots_name"]))
        delete_counts = Coll.delete(expr=" ots_id in ['%s']"%(_id)).delete_count

        # bug fix: the log line previously said "brand" although this handler deletes specs
        log("specs %s total_count %d md5:%s delete_counts:%d"%(specs,total_count,_id,delete_counts))

    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    Coll.compact()
+
|
|
|
def remove_redis_keys():
    """Flush every key from the product redis database (pool_product)."""
    redis.Redis(connection_pool=pool_product).flushdb()
+
|
|
|
+
|
|
|
def update_document_product_dict():
    """Batch-update dictionary rows (standard_alias / remove_words / level)
    from update_product.csv (gbk-encoded).

    Only names whose (name, grade) matches exactly one OTS record are updated;
    IS_SYNCHONIZED is bumped so the sync job re-processes the row.
    """
    import pandas as pd
    filename = "update_product.csv"
    df = pd.read_csv(filename,encoding="gbk")
    ots_client = getConnect_ots()
    for name,grade,standard_alias,remove_words,level in zip(df["name"],df["grade"],df["standard_alias"],df["remove_words"],df["level"]):
        name = name.strip()
        bool_query = BoolQuery(must_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
        ])
        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                       SearchQuery(bool_query,get_total_count=True),
                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
        # skip missing or ambiguous names: update only exact single matches
        if total_count==1:
            list_data = getRow_ots(rows)
            _data = list_data[0]
            dpd = Document_product_dict(_data)
            # bug fix: the original set level = 1 unconditionally, discarding
            # the "level" column just read from the CSV; default it only when
            # the cell is empty (pandas reads empty cells as NaN)
            if level is None or str(level)=="nan":
                level = 1
            level = int(level)
            if re.search("器械|设备|其他",name) is not None and level==1:
                level = 2
            if str(remove_words)=="nan":
                remove_words = ""
            # handle empty alias cells the same way as remove_words
            if str(standard_alias)=="nan":
                standard_alias = ""
            dpd.setValue(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,standard_alias,True)
            dpd.setValue(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,remove_words,True)
            dpd.setValue(DOCUMENT_PRODUCT_DICT_LEVEL,level,True)
            dpd.setValue(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED+1,True)
            dpd.update_row(ots_client)
            print(dpd.getProperties())
|
|
|
|
|
|
def test():
    """Manual driver for the one-off maintenance routines in this module;
    uncomment exactly the routine you want to run."""
    # pm = Product_Manager()
    # pm.test()
    # fix_product_data()
    # test_check_brand()
    test_match()
    # rebuild_milvus()

    # move_document_product()
    # delete_brands()
    # delete_specs()
    # remove_redis_keys()
    # update_document_product_dict()
|
|
def clean_product_dict_interface():
    """Delete every "insert"/"base" action row from the product-dict interface
    table: page through the index with next_token, queue the primary keys,
    then delete them with 30 worker threads. Side effects only."""
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[
        BoolQuery(should_queries=[
            TermQuery("action","insert"),
            TermQuery("action","base")
        ])
    ])
    task_queue = Queue()
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
                                                                   columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
    list_data = getRow_ots(rows)
    for _data in list_data:
        task_queue.put(_data)
    print("%d/%d"%(task_queue.qsize(),total_count))
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_data = getRow_ots(rows)
        for _data in list_data:
            task_queue.put(_data)
        print("%d/%d"%(task_queue.qsize(),total_count))

    def _handle(item,result_queue):
        # each item carries the row's primary key; delete that row
        _dpd = Document_product_dict_interface(item)
        _dpd.delete_row(ots_client)
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
|
|
|
if __name__ == '__main__':

    # manual entry point: exactly one maintenance routine is enabled at a time
    # test()
    # start_process_product()
    # print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
    # print(Product_Manager.get_bid_filemd5s(155415770,getConnect_ots()))
    # name = "一"
    # ots_name = "一氧化碳分析仪"
    # print(is_similar(name,ots_name),check_product(name,ots_name))
    # print(is_legal_specs('SCM-A/SB(0.18D)'))
    clean_product_dict_interface()