|
@@ -143,7 +143,7 @@ class Product_Dict_Manager():
|
|
|
remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
|
|
|
level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL,1)
|
|
|
|
|
|
- if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level):
|
|
|
+ if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level,wait_sync=False):
|
|
|
|
|
|
_pd = Document_product_dict_interface({DOCUMENT_PRODUCT_DICT_ID:_id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
|
|
|
_pd.update_row(self.ots_client)
|
|
@@ -250,6 +250,7 @@ class Product_Dict_Manager():
|
|
|
SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
+ log("insert brand %s %d counts"%(name,total_count))
|
|
|
while next_token:
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
@@ -361,25 +362,24 @@ class Product_Dict_Manager():
|
|
|
])
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
while next_token:
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data.extend(getRow_ots(rows))
|
|
|
for _d in list_data:
|
|
|
- dict_name_id = _d.get(DOCUMENT_PRODUCT_DICT_NAME_ID)
|
|
|
- if dict_name_id is not None and dict_name_id!="":
|
|
|
+            dict_brand_id = _d.get(DOCUMENT_PRODUCT_DICT_BRAND_ID)
|
|
|
+ if dict_brand_id is not None and dict_brand_id!="":
|
|
|
_query = BoolQuery(must_queries=[
|
|
|
- TermQuery(DOCUMENT_PRODUCT_DICT_NAME_ID,dict_name_id)
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_BRAND_ID,dict_brand_id)
|
|
|
])
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,get_total_count=True))
|
|
|
if total_count==1:
|
|
|
- dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_name_id})
|
|
|
- self.recurse_delete_dict(dict_name_id)
|
|
|
- dpd.delete_row(self.ots_client)
|
|
|
+ dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_brand_id})
|
|
|
+ self.recurse_delete_dict(dict_brand_id)
|
|
|
|
|
|
_id = _d.get(DOCUMENT_PRODUCT_ID)
|
|
|
original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
|
|
@@ -519,28 +519,27 @@ class Product_Dict_Manager():
|
|
|
])
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
|
|
|
|
|
|
while next_token:
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data.extend(getRow_ots(rows))
|
|
|
|
|
|
for _d in list_data:
|
|
|
- dict_name_id = _d.get(DOCUMENT_PRODUCT_DICT_NAME_ID)
|
|
|
- if dict_name_id is not None and dict_name_id!="":
|
|
|
+            dict_brand_id = _d.get(DOCUMENT_PRODUCT_DICT_BRAND_ID)
|
|
|
+ if dict_brand_id is not None and dict_brand_id!="":
|
|
|
_query = BoolQuery(must_queries=[
|
|
|
- TermQuery(DOCUMENT_PRODUCT_DICT_NAME_ID,dict_name_id)
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_BRAND_ID,dict_brand_id)
|
|
|
])
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
|
|
|
SearchQuery(_query,get_total_count=True))
|
|
|
if total_count==1:
|
|
|
- dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_name_id})
|
|
|
- self.recurse_delete_dict(dict_name_id)
|
|
|
- dpd.delete_row(self.ots_client)
|
|
|
+ dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_brand_id})
|
|
|
+ self.recurse_delete_dict(dict_brand_id)
|
|
|
|
|
|
_id = _d.get(DOCUMENT_PRODUCT_ID)
|
|
|
original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
|
|
@@ -671,7 +670,7 @@ class Product_Dict_Manager():
|
|
|
|
|
|
# search interface if name and grade exists then update document_product_dict and return
|
|
|
|
|
|
- interface_id = get_milvus_product_dict_id(name)
|
|
|
+ interface_id = get_document_product_dict_interface_base_id(name,grade)
|
|
|
_interface_d = {
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_ID:interface_id,
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:alias,
|
|
@@ -705,7 +704,7 @@ class Product_Dict_Manager():
|
|
|
if _alias==name:
|
|
|
continue
|
|
|
list_name.append(_alias)
|
|
|
- time.sleep(1)
|
|
|
+ time.sleep(PRODUCT_REDIS_CACHE_TIME)
|
|
|
|
|
|
|
|
|
#judge whether there exists records before this record created,if not process the history data
|
|
@@ -847,7 +846,7 @@ class Product_Dict_Manager():
|
|
|
def act_update(self,name,alias,grade,original_id,parent_id,standard_alias,create_time,remove_words,level):
|
|
|
# check whether there are change variable
|
|
|
|
|
|
- _interface_id = get_milvus_product_dict_id(name)
|
|
|
+ _interface_id = get_document_product_dict_interface_base_id(name,grade)
|
|
|
_d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:_interface_id}
|
|
|
_dpdi = Document_product_dict_interface(_d)
|
|
|
if not _dpdi.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS,DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS,DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL],True):
|
|
@@ -863,7 +862,7 @@ class Product_Dict_Manager():
|
|
|
if not update_flag:
|
|
|
return
|
|
|
|
|
|
- interface_id = get_milvus_product_dict_id(name)
|
|
|
+ interface_id = get_document_product_dict_interface_base_id(name,grade)
|
|
|
final_alias = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_alias_set))
|
|
|
final_standard_alias = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_standard_alias_set))
|
|
|
final_remove_words = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_remove_words_set))
|
|
@@ -888,6 +887,8 @@ class Product_Dict_Manager():
|
|
|
for _name in delete_standard_names:
|
|
|
delete_record_from_milvus(Coll,_name,"")
|
|
|
|
|
|
+ time.sleep(PRODUCT_REDIS_CACHE_TIME)
|
|
|
+
|
|
|
# update document_product_dict
|
|
|
# update alias
|
|
|
if len(new_alias_set&original_alias_set)!=len(new_alias_set):
|
|
@@ -934,6 +935,14 @@ class Product_Dict_Manager():
|
|
|
log("delete id:%s"%(_id))
|
|
|
self.recurse_delete_dict(_id)
|
|
|
dpd.delete_row(self.ots_client)
|
|
|
+ face_id = get_document_product_dict_interface_base_id(_name,grade)
|
|
|
+ _interface_d = {
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ID:face_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:randint(401,451)
|
|
|
+ }
|
|
|
+ _dpdi = Document_product_dict_interface(_interface_d)
|
|
|
+ if _dpdi.exists_row(self.ots_client):
|
|
|
+ _dpdi.update_row(self.ots_client)
|
|
|
|
|
|
# process history
|
|
|
if len(delete_standard_names)>0:
|
|
@@ -1012,7 +1021,7 @@ class Product_Dict_Manager():
|
|
|
columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
|
|
|
list_data.extend(getRow_ots(rows))
|
|
|
|
|
|
- interface_id = get_milvus_product_dict_id(name)
|
|
|
+ interface_id = get_document_product_dict_interface_base_id(name,grade)
|
|
|
|
|
|
|
|
|
|
|
@@ -1020,7 +1029,7 @@ class Product_Dict_Manager():
|
|
|
Coll,_ = self.get_collection(grade)
|
|
|
|
|
|
delete_record_from_milvus(Coll,name,standard_alias)
|
|
|
- time.sleep(1)
|
|
|
+ time.sleep(PRODUCT_REDIS_CACHE_TIME)
|
|
|
|
|
|
#process_history data
|
|
|
self.process_history_by_name([name],grade,"delete")
|
|
@@ -1329,7 +1338,7 @@ def clean_similar():
|
|
|
|
|
|
|
|
|
|
|
|
-def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words="",level=1):
|
|
|
+def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words="",level=1,wait_sync=True):
|
|
|
|
|
|
n_name = get_milvus_standard_name(name)
|
|
|
name_id = get_milvus_product_dict_id(n_name)
|
|
@@ -1379,25 +1388,28 @@ def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_
|
|
|
[level]
|
|
|
]
|
|
|
insert_embedding(Coll,data)
|
|
|
- while 1:
|
|
|
- try:
|
|
|
- log("milvus insert wait for done")
|
|
|
- list_result = Coll.query(expr=expr,output_fields=["standard_name"])
|
|
|
- log("list_result"+str(list_result)+str(type(list_result[0])))
|
|
|
- if len(list_result)==1:
|
|
|
- if list_result[0].get("standard_name","")==name:
|
|
|
- log("milvus insert done")
|
|
|
- return True
|
|
|
- time.sleep(1)
|
|
|
- except Exception as e:
|
|
|
- traceback.print_exc()
|
|
|
+ if wait_sync:
|
|
|
+ while 1:
|
|
|
+ try:
|
|
|
+ log("milvus insert wait for done")
|
|
|
+ list_result = Coll.query(expr=expr,output_fields=["standard_name"])
|
|
|
+ log("list_result"+str(list_result)+str(type(list_result[0])))
|
|
|
+ if len(list_result)==1:
|
|
|
+ if list_result[0].get("standard_name","")==name:
|
|
|
+ log("milvus insert done")
|
|
|
+ return True
|
|
|
+ time.sleep(1)
|
|
|
+ except Exception as e:
|
|
|
+ traceback.print_exc()
|
|
|
+ else:
|
|
|
+ return True
|
|
|
|
|
|
def delete_record_from_milvus(Coll,name,standard_alias):
|
|
|
|
|
|
n_name = get_milvus_standard_name(name)
|
|
|
name_id = get_milvus_product_dict_id(n_name)
|
|
|
|
|
|
- log("delete name %s grade %s"%(str(name),str(standard_alias)))
|
|
|
+ log("delete name %s standard_alias %s"%(str(name),str(standard_alias)))
|
|
|
|
|
|
expr = " ots_id in ['%s']"%name_id
|
|
|
Coll.delete(expr)
|
|
@@ -1435,13 +1447,44 @@ def dict_interface_delete(name,grade,ots_client = getConnect_ots()):
|
|
|
dpdi.update_row(ots_client)
|
|
|
|
|
|
|
|
|
-def interface_deletes():
|
|
|
+def interface_insert():
|
|
|
+ from uuid import uuid4
|
|
|
a = '''
|
|
|
- 按采购需求执行
|
|
|
'''
|
|
|
grade = 4
|
|
|
+
|
|
|
+ new_standard_alias = ""
|
|
|
+ new_remove_words = ""
|
|
|
+
|
|
|
+ list_brand = []
|
|
|
ots_client=getConnect_ots()
|
|
|
for s in re.split("[\n\s,.,。、]",a):
|
|
|
+ s = s.strip()
|
|
|
+ if s=="":
|
|
|
+ continue
|
|
|
+ list_brand.append(s)
|
|
|
+ grade = 4
|
|
|
+
|
|
|
+ for brand in list_brand:
|
|
|
+ print(brand)
|
|
|
+ _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:brand,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert",
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:new_standard_alias,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:new_remove_words,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
|
|
|
+ dpdi = Document_product_dict_interface(_d)
|
|
|
+ dpdi.update_row(ots_client)
|
|
|
+
|
|
|
+def interface_deletes():
|
|
|
+ a = '''
|
|
|
+ 株式会社
|
|
|
+ '''
|
|
|
+ grade = 4
|
|
|
+ ots_client=getConnect_ots()
|
|
|
+ for s in re.split("[\n\s]",a):
|
|
|
s = s.strip()
|
|
|
if s=="":
|
|
|
continue
|
|
@@ -1449,9 +1492,9 @@ def interface_deletes():
|
|
|
dict_interface_delete(s,grade,ots_client)
|
|
|
|
|
|
def interface_update():
|
|
|
- name = "保健"
|
|
|
- new_standard_alias = ""
|
|
|
- new_remove_words = "+设备"
|
|
|
+ name = "万东"
|
|
|
+ new_standard_alias = "+万东康源|北京万东"
|
|
|
+ new_remove_words = ""
|
|
|
grade = 4
|
|
|
ots_client = getConnect_ots()
|
|
|
|
|
@@ -1468,6 +1511,68 @@ def interface_update():
|
|
|
dpdi = Document_product_dict_interface(_d)
|
|
|
dpdi.update_row(ots_client)
|
|
|
|
|
|
+def interface_brand_update_by_file():
|
|
|
+ import pandas as pd
|
|
|
+ import re
|
|
|
+ filename = "../../test/品牌合并.xlsx"
|
|
|
+ df0 = pd.read_excel(filename,0)
|
|
|
+ df1 = pd.read_excel(filename,1)
|
|
|
+ set_source_brand = set()
|
|
|
+ for b in df0["brands"]:
|
|
|
+ if b is None or b=="":
|
|
|
+ continue
|
|
|
+ list_brand = b.split(",")
|
|
|
+ for brand in list_brand:
|
|
|
+ brand = brand.strip()
|
|
|
+ if brand=="":
|
|
|
+ continue
|
|
|
+ set_source_brand.add(brand)
|
|
|
+ target_brand = df1["brand"]
|
|
|
+ target_standard_alias = df1["standard_alias"]
|
|
|
+ _check_flag = True
|
|
|
+ list_target = []
|
|
|
+ for tbrand,standard_alias in zip(target_brand,target_standard_alias):
|
|
|
+ brand = tbrand.strip()
|
|
|
+ if brand not in set_source_brand:
|
|
|
+ print("not in source:%s"%(brand))
|
|
|
+ _check_flag = False
|
|
|
+ if standard_alias is None or standard_alias=="" or str(standard_alias)=="nan":
|
|
|
+ continue
|
|
|
+ list_brand = re.split("[,,]",standard_alias)
|
|
|
+ set_alias = set()
|
|
|
+ for brand in list_brand:
|
|
|
+ brand = brand.strip()
|
|
|
+ if brand=="":
|
|
|
+ continue
|
|
|
+ if brand not in set_source_brand:
|
|
|
+ print("not in source:%s"%(brand))
|
|
|
+ _check_flag = False
|
|
|
+ set_alias.add(brand)
|
|
|
+ _d = {"brand":tbrand.strip(),
|
|
|
+ "standard_alias":"+"+"|".join(list(set_alias))}
|
|
|
+ list_target.append(_d)
|
|
|
+
|
|
|
+
|
|
|
+ if _check_flag or 1:
|
|
|
+ grade = 4
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+ from uuid import uuid4
|
|
|
+ for target in list_target:
|
|
|
+ name = target["brand"]
|
|
|
+ new_standard_alias = target["standard_alias"]
|
|
|
+ _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:name,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"update",
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:new_standard_alias,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:"",
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
|
|
|
+ dpdi = Document_product_dict_interface(_d)
|
|
|
+ dpdi.update_row(ots_client)
|
|
|
+ print(list_target)
|
|
|
+
|
|
|
+
|
|
|
def clean_brands():
|
|
|
from queue import Queue as TQueue
|
|
|
task_queue = TQueue()
|
|
@@ -1475,13 +1580,17 @@ def clean_brands():
|
|
|
|
|
|
list_data = []
|
|
|
|
|
|
- columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE]
|
|
|
+ table_name = Document_product_dict_interface_table_name
|
|
|
+ table_index = table_name+"_index"
|
|
|
+
|
|
|
+ columns=[DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE]
|
|
|
|
|
|
bool_query = BoolQuery(must_queries=[
|
|
|
- RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,4,4,True,True),
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE),
|
|
|
+ RangeQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,4,4,True,True),
|
|
|
])
|
|
|
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED)]),limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
|
|
|
|
|
@@ -1490,7 +1599,7 @@ def clean_brands():
|
|
|
list_data.append(_d)
|
|
|
|
|
|
while next_token:
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
|
|
|
list_dict = getRow_ots(rows)
|
|
@@ -1502,6 +1611,7 @@ def clean_brands():
|
|
|
|
|
|
set_key = set()
|
|
|
list_process_data = []
|
|
|
+ set_brand = set()
|
|
|
for _d in list_data:
|
|
|
name = _d.get(DOCUMENT_PRODUCT_DICT_NAME)
|
|
|
grade = _d.get(DOCUMENT_PRODUCT_DICT_GRADE)
|
|
@@ -1511,21 +1621,36 @@ def clean_brands():
|
|
|
set_key.add(_key)
|
|
|
task_queue.put(_d)
|
|
|
list_process_data.append(_d)
|
|
|
+ if grade==BRAND_GRADE:
|
|
|
+ set_brand.add(name)
|
|
|
def _handle(item,result_queue):
|
|
|
name = item.get(DOCUMENT_PRODUCT_DICT_NAME)
|
|
|
|
|
|
+ _legal = is_legal_brand(ots_client,name)
|
|
|
-                    if is_legal_brand(ots_client,name):
+                    if _legal:
|
|
|
item["legal"] = 1
|
|
|
+ elif _legal==False:
|
|
|
+ item["legal"] = 0
|
|
|
else:
|
|
|
+ item["legal"] = 0
|
|
|
bool_query = BoolQuery(must_queries=[
|
|
|
TermQuery("brand",name)
|
|
|
])
|
|
|
rows,next_token,total_count,is_all_succeed = ots_client.search("document_product","document_product_index",
|
|
|
SearchQuery(bool_query,get_total_count=True))
|
|
|
- if total_count>0:
|
|
|
+ if total_count>=2:
|
|
|
item["legal"] = 1
|
|
|
else:
|
|
|
- item["legal"] = 0
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ NestedQuery("products",WildcardQuery("products.brand",name)),
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
|
|
|
+ SearchQuery(bool_query,get_total_count=True))
|
|
|
+ if total_count>=1:
|
|
|
+ item["legal"] = 1
|
|
|
+ else:
|
|
|
+ item["legal"] = 0
|
|
|
+
|
|
|
mt = MultiThreadHandler(task_queue,_handle,None,30)
|
|
|
mt.run()
|
|
|
|
|
@@ -1545,10 +1670,203 @@ def clean_brands():
|
|
|
for _name in list_illegal:
|
|
|
f.write("%s\n"%(_name))
|
|
|
|
|
|
+def merge_brands():
|
|
|
+ from queue import Queue as TQueue
|
|
|
+ import pandas as pd
|
|
|
+ task_queue = TQueue()
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+
|
|
|
+ list_data = []
|
|
|
+
|
|
|
+ table_name = Document_product_dict_interface_table_name
|
|
|
+ table_index = table_name+"_index"
|
|
|
+
|
|
|
+ columns=[DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS]
|
|
|
+
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE),
|
|
|
+ RangeQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,4,4,True,True),
|
|
|
+ ])
|
|
|
+
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED)]),limit=100,get_total_count=True),
|
|
|
+ columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
+ list_dict = getRow_ots(rows)
|
|
|
+ for _d in list_dict:
|
|
|
+ list_data.append(_d)
|
|
|
+
|
|
|
+ while next_token:
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
+ columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
|
|
|
+ list_dict = getRow_ots(rows)
|
|
|
+ for _d in list_dict:
|
|
|
+ list_data.append(_d)
|
|
|
+ # if len(list_data)>=1000:
|
|
|
+ # break
|
|
|
+ log("product_dict embedding total_count:%d"%total_count)
|
|
|
+
|
|
|
+ set_key = set()
|
|
|
+ list_process_data = []
|
|
|
+ set_brand = set()
|
|
|
+ for _d in list_data:
|
|
|
+ name = _d.get(DOCUMENT_PRODUCT_DICT_NAME)
|
|
|
+ grade = _d.get(DOCUMENT_PRODUCT_DICT_GRADE)
|
|
|
+ _key = "%s-%d"%(name,grade)
|
|
|
+ if _key in set_key:
|
|
|
+ continue
|
|
|
+ set_key.add(_key)
|
|
|
+ task_queue.put(_d)
|
|
|
+ list_process_data.append(_d)
|
|
|
+ if grade==BRAND_GRADE:
|
|
|
+ set_brand.add(name)
|
|
|
+
|
|
|
+ area_set = get_area_set()
|
|
|
+ def _handle(item,result_queue):
|
|
|
+ name = item.get(DOCUMENT_PRODUCT_DICT_NAME)
|
|
|
+ grade = item.get(DOCUMENT_PRODUCT_DICT_GRADE)
|
|
|
+ for i in range(min(len(name)-2,8)):
|
|
|
+ _n = name[:i+1]
|
|
|
+ if _n in area_set:
|
|
|
+                n_name = re.sub("^[省市区]",'',name[i+1:])
|
|
|
+ if n_name in set_brand:
|
|
|
+ item["belongs_to"] = n_name
|
|
|
+ standard_alias = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS)
|
|
|
+ if standard_alias is not None and standard_alias!="":
|
|
|
+ for salias in standard_alias.split("|"):
|
|
|
+ face_id = get_document_product_dict_interface_base_id(salias,grade)
|
|
|
+ _interface_d = {
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ID:face_id,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:randint(401,451)
|
|
|
+ }
|
|
|
+ _dpdi = Document_product_dict_interface(_interface_d)
|
|
|
+ if _dpdi.exists_row(ots_client):
|
|
|
+ _dpdi.update_row(ots_client)
|
|
|
+
|
|
|
+
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,None,20)
|
|
|
+ mt.run()
|
|
|
+ dict_belongs_alias = {}
|
|
|
+ for _data in list_process_data:
|
|
|
+ name = _data.get(DOCUMENT_PRODUCT_DICT_NAME)
|
|
|
+ belongs_to = _data.get("belongs_to")
|
|
|
+ if belongs_to is not None:
|
|
|
+ if belongs_to not in dict_belongs_alias:
|
|
|
+ dict_belongs_alias[belongs_to] = []
|
|
|
+ dict_belongs_alias[belongs_to].append(name)
|
|
|
+ df_data = {"brand":[],"standard_alias":[]}
|
|
|
+ for k,v in dict_belongs_alias.items():
|
|
|
+ df_data["brand"].append(k)
|
|
|
+ df_data["standard_alias"].append("|".join(v))
|
|
|
+ df = pd.DataFrame(df_data)
|
|
|
+ df.to_excel("../../merge.xlsx",columns=["brand","standard_alias"])
|
|
|
+
|
|
|
+
|
|
|
+ # grade = 4
|
|
|
+ # ots_client = getConnect_ots()
|
|
|
+ # from uuid import uuid4
|
|
|
+ # for k,v in dict_belongs_alias.items():
|
|
|
+ # name = k
|
|
|
+ # new_standard_alias = "+%s"%("|".join(v))
|
|
|
+ # print(k,new_standard_alias)
|
|
|
+ # _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:name,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"update",
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:new_standard_alias,
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:"",
|
|
|
+ # DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
|
|
|
+ # dpdi = Document_product_dict_interface(_d)
|
|
|
+ # dpdi.update_row(ots_client)
|
|
|
+
|
|
|
+
|
|
|
+def interface_delete_brands():
|
|
|
+ from uuid import uuid4
|
|
|
+ ots_client=getConnect_ots()
|
|
|
+ list_brand = []
|
|
|
+
|
|
|
+ a = '''
|
|
|
+ 日本
|
|
|
+ '''
|
|
|
+ grade = 4
|
|
|
+
|
|
|
+ for s in re.split("[\n\s,.,。、]",a):
|
|
|
+ s = s.strip()
|
|
|
+ if s=="":
|
|
|
+ continue
|
|
|
+ list_brand.append(s)
|
|
|
+
|
|
|
+ with open("../../test/illegal_brand.txt","r",encoding="utf8") as f:
|
|
|
+ while 1:
|
|
|
+ brand = f.readline()
|
|
|
+ if not brand:
|
|
|
+ break
|
|
|
+ brand = brand.strip()
|
|
|
+ if brand!="":
|
|
|
+ list_brand.append(brand)
|
|
|
+
|
|
|
+ for brand in list_brand:
|
|
|
+ print(brand)
|
|
|
+ _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:brand,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"delete",
|
|
|
+ DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
|
|
|
+ "is_temp":1
|
|
|
+ }
|
|
|
+ dpdi = Document_product_dict_interface(_d)
|
|
|
+ dpdi.update_row(ots_client)
|
|
|
+
|
|
|
+def clean_interface_delete_temp():
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+
|
|
|
+ table_name = Document_product_dict_interface_table_name
|
|
|
+ table_index = table_name+"_index"
|
|
|
+ columns = ["is_temp","status","name"]
|
|
|
+
|
|
|
+ task_queue = Queue()
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ TermQuery("action","delete")
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
|
|
|
+ columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for _data in list_data:
|
|
|
+ task_queue.put(_data)
|
|
|
+ print("%d/%d"%(task_queue.qsize(),total_count))
|
|
|
+ while next_token:
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
|
|
|
+ columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for _data in list_data:
|
|
|
+ task_queue.put(_data)
|
|
|
+ print("%d/%d"%(task_queue.qsize(),total_count))
|
|
|
+ def _handle(item,result_queue):
|
|
|
+ is_temp = item.get("is_temp",0)
|
|
|
+ status = item.get("status",0)
|
|
|
+ name = item.get("name")
|
|
|
+ dpdi = Document_product_dict_interface(item)
|
|
|
+ if is_temp==1 and status>=201:
|
|
|
+ dpdi.delete_row(ots_client)
|
|
|
+ else:
|
|
|
+ pass
|
|
|
+ # dpdi.setValue("status",1,True)
|
|
|
+ # dpdi.update_row(ots_client)
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,None,30)
|
|
|
+ mt.run()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def clean_product_dict():
|
|
|
ots_client = getConnect_ots()
|
|
|
bool_query = BoolQuery(must_queries=[
|
|
|
- RangeQuery("status",0)
|
|
|
+ RangeQuery("grade",3)
|
|
|
])
|
|
|
task_queue = Queue()
|
|
|
rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
|
|
@@ -1604,11 +1922,53 @@ def clean_product_dict_interface():
|
|
|
mt = MultiThreadHandler(task_queue,_handle,None,30)
|
|
|
mt.run()
|
|
|
|
|
|
+def rerun_interface_deletes():
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+
|
|
|
+ table_name = Document_product_dict_interface_table_name
|
|
|
+ table_index = table_name+"_index"
|
|
|
+ columns = ["is_temp","status","name"]
|
|
|
+
|
|
|
+ task_queue = Queue()
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ TermQuery("action","delete")
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
|
|
|
+ columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for _data in list_data:
|
|
|
+ task_queue.put(_data)
|
|
|
+ print("%d/%d"%(task_queue.qsize(),total_count))
|
|
|
+ while next_token:
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
|
|
|
+ SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
|
|
|
+ columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ for _data in list_data:
|
|
|
+ task_queue.put(_data)
|
|
|
+ print("%d/%d"%(task_queue.qsize(),total_count))
|
|
|
+ def _handle(item,result_queue):
|
|
|
+ status = item.get("status",0)
|
|
|
+ dpdi = Document_product_dict_interface(item)
|
|
|
+ dpdi.setValue("status",1,True)
|
|
|
+ dpdi.update_row(ots_client)
|
|
|
+ mt = MultiThreadHandler(task_queue,_handle,None,30)
|
|
|
+ mt.run()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
|
|
|
# start_embedding_product_dict()
|
|
|
- # interface_deletes()
|
|
|
+ interface_deletes()
|
|
|
+ interface_insert()
|
|
|
# interface_update()
|
|
|
+ # interface_brand_update_by_file()
|
|
|
# clean_similar()
|
|
|
# clean_brands()
|
|
|
+ # merge_brands()
|
|
|
+ # interface_delete_brands()
|
|
|
+ # clean_interface_delete_temp()
|
|
|
# clean_product_dict()
|
|
|
- clean_product_dict_interface()
|
|
|
+ # clean_product_dict_interface()
|
|
|
+ # rerun_interface_deletes()
|