|
@@ -65,7 +65,9 @@ class Product_Dict_Manager():
|
|
|
FieldSchema(name="standard_name_id",dtype=DataType.VARCHAR,max_length=32),
|
|
|
FieldSchema(name="embedding",dtype=DataType.FLOAT_VECTOR,dim=1024),
|
|
|
FieldSchema(name="ots_parent_id",dtype=DataType.VARCHAR,max_length=32),
|
|
|
- FieldSchema(name="ots_grade",dtype=DataType.INT64)
|
|
|
+ FieldSchema(name="ots_grade",dtype=DataType.INT64),
|
|
|
+ FieldSchema(name="remove_words",dtype=DataType.VARCHAR,max_length=3000),
|
|
|
+ FieldSchema(name="level",dtype=DataType.INT64),
|
|
|
]
|
|
|
|
|
|
index_name = "embedding"
|
|
@@ -826,23 +828,29 @@ def clean_similar():
|
|
|
|
|
|
|
|
|
|
|
|
-def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
|
|
|
+def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words="",level=1):
|
|
|
|
|
|
n_name = get_milvus_standard_name(name)
|
|
|
name_id = get_milvus_product_dict_id(n_name)
|
|
|
|
|
|
+
|
|
|
vector = request_embedding(n_name)
|
|
|
|
|
|
log("insert name %s grade %d"%(name,grade))
|
|
|
if vector is not None and Coll is not None:
|
|
|
|
|
|
+ expr = " ots_id in ['%s']"%name_id
|
|
|
+ Coll.delete(expr)
|
|
|
data = [[name_id],
|
|
|
[name],
|
|
|
[name],
|
|
|
[name_id],
|
|
|
[vector],
|
|
|
[parent_id],
|
|
|
- [grade]]
|
|
|
+ [grade],
|
|
|
+ [remove_words],
|
|
|
+ [level]
|
|
|
+ ]
|
|
|
insert_embedding(Coll,data)
|
|
|
|
|
|
if standard_alias is not None and standard_alias!="":
|
|
@@ -854,6 +862,9 @@ def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
|
|
|
if _alias==name:
|
|
|
continue
|
|
|
_id = get_document_product_dict_standard_alias_id(_alias)
|
|
|
+
|
|
|
+ expr = " ots_id in ['%s']"%_id
|
|
|
+ Coll.delete(expr)
|
|
|
n_alias = get_milvus_standard_name(_alias)
|
|
|
vector = request_embedding(n_alias)
|
|
|
data = [[_id],
|
|
@@ -862,7 +873,10 @@ def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
|
|
|
[name_id],
|
|
|
[vector],
|
|
|
[parent_id],
|
|
|
- [grade]]
|
|
|
+ [grade],
|
|
|
+ [remove_words],
|
|
|
+ [level]
|
|
|
+ ]
|
|
|
insert_embedding(Coll,data)
|
|
|
return True
|
|
|
|
|
@@ -916,8 +930,85 @@ def interface_deletes():
|
|
|
print(s)
|
|
|
dict_interface_delete(s,grade,ots_client)
|
|
|
|
|
|
def clean_brands(legal_path="../../test/legal_brand.txt",
                 illegal_path="../../test/illegal_brand.txt"):
    """Classify every grade-4 product-dict entry as a legal or illegal brand.

    Steps:
      1. Page through the product-dict search index for rows whose grade
         equals 4 (presumably the "brand" level -- confirm against the
         dictionary definition) and collect name/parent_id/grade.
      2. De-duplicate the rows by (name, grade).
      3. In a pool of 30 worker threads, mark each row ``legal`` = 1 when
         ``is_legal_brand`` accepts the name or when at least one row in
         ``document_product`` uses it as ``brand``; otherwise ``legal`` = 0.
      4. Write the legal names and the illegal names to two text files,
         one name per line.

    Args:
        legal_path: Output file for names classified as legal. Relative
            paths are resolved against the current working directory.
        illegal_path: Output file for names classified as illegal.

    Returns:
        None. The results are the two files written to disk.
    """
    # Imported locally under an alias, as in the original code.
    # NOTE(review): presumably avoids clashing with another Queue in the module.
    from queue import Queue as TQueue

    task_queue = TQueue()
    ots_client = getConnect_ots()

    table_name = Document_product_dict_table_name
    index_name = table_name + "_index"
    columns = [DOCUMENT_PRODUCT_DICT_NAME,
               DOCUMENT_PRODUCT_DICT_PARENT_ID,
               DOCUMENT_PRODUCT_DICT_GRADE]
    columns_to_get = ColumnsToGet(columns, ColumnReturnType.SPECIFIED)

    grade_query = BoolQuery(must_queries=[
        RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE, 4, 4, True, True),
    ])

    # The first page carries the sort; follow-up pages carry only the
    # next_token returned by the previous page.
    list_data = []
    search_query = SearchQuery(
        grade_query,
        sort=Sort(sorters=[FieldSort(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED)]),
        limit=100,
        get_total_count=True)
    while True:
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            table_name, index_name, search_query,
            columns_to_get=columns_to_get)
        list_data.extend(getRow_ots(rows))
        if not next_token:
            break
        search_query = SearchQuery(grade_query, next_token=next_token,
                                   limit=100, get_total_count=True)
    log("product_dict embedding total_count:%d" % total_count)

    # De-duplicate on (name, grade). A tuple key cannot fail on a missing
    # grade the way the "%s-%d" format string did.
    seen_keys = set()
    list_process_data = []
    for _d in list_data:
        name = _d.get(DOCUMENT_PRODUCT_DICT_NAME)
        if not name:
            # Nothing to classify; avoids querying for / writing out "None".
            continue
        _key = (name, _d.get(DOCUMENT_PRODUCT_DICT_GRADE))
        if _key in seen_keys:
            continue
        seen_keys.add(_key)
        task_queue.put(_d)
        list_process_data.append(_d)

    def _handle(item, result_queue):
        """Worker: set item["legal"] to 1 or 0 for a single dict row."""
        name = item.get(DOCUMENT_PRODUCT_DICT_NAME)

        if is_legal_brand(ots_client, name):
            item["legal"] = 1
            return

        # Fall back to usage: a name that appears as the brand of any
        # extracted product is kept as legal.
        brand_query = BoolQuery(must_queries=[
            TermQuery("brand", name)
        ])
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document_product", "document_product_index",
            SearchQuery(brand_query, get_total_count=True))
        item["legal"] = 1 if total_count > 0 else 0

    mt = MultiThreadHandler(task_queue, _handle, None, 30)
    mt.run()

    list_legal = []
    list_illegal = []
    for _data in list_process_data:
        name = _data.get(DOCUMENT_PRODUCT_DICT_NAME)
        legal = _data.get("legal")
        if legal is None:
            # The worker never produced a verdict (e.g. it failed inside the
            # thread pool). Do not guess: report it and leave it out of both
            # lists instead of aborting the whole run with a KeyError.
            log("clean_brands: no legal flag for %s, skipped" % name)
            continue
        if legal == 1:
            list_legal.append(name)
        else:
            list_illegal.append(name)

    with open(legal_path, "w", encoding="utf8") as f:
        f.writelines("%s\n" % _name for _name in list_legal)
    with open(illegal_path, "w", encoding="utf8") as f:
        f.writelines("%s\n" % _name for _name in list_illegal)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
# start_embedding_product_dict()
|
|
|
# interface_deletes()
|
|
|
- clean_similar()
|
|
|
+ # clean_similar()
|
|
|
+ clean_brands()
|