|
@@ -30,12 +30,13 @@ import logging
|
|
|
root = logging.getLogger()
|
|
|
root.setLevel(logging.INFO)
|
|
|
from uuid import uuid4
|
|
|
+from multiprocessing import Queue as PQueue
|
|
|
|
|
|
class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
def __init__(self):
|
|
|
super(Product_Manager, self).__init__()
|
|
|
- self.process_queue = Queue()
|
|
|
+ self.process_queue = PQueue()
|
|
|
self.ots_client = getConnect_ots()
|
|
|
|
|
|
self.set_id = set()
|
|
@@ -68,6 +69,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
|
|
|
list_data = getRow_ots(rows)
|
|
|
_count = len(list_data)
|
|
|
+ log("producer %d/%d"%(q_size,total_count))
|
|
|
list_id = []
|
|
|
for _d in list_data:
|
|
|
_id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
|
|
@@ -113,7 +115,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
|
|
|
|
|
|
- def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
|
|
|
+ def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]):
|
|
|
'''
|
|
|
Standardizes the product data
|
|
|
通过匹配标准参数表进行标准化,匹配是非精确匹配,校验规则是?
|
|
@@ -140,8 +142,15 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
|
|
|
parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
|
|
|
|
|
|
+
|
|
|
+ original_name = name
|
|
|
+ original_brand = brand
|
|
|
+ original_specs = specs
|
|
|
+
|
|
|
list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
|
|
|
|
|
|
+ list_candidate_brand_specs = [a for a in [brand,specs,parameters,name] if a!=""]
|
|
|
+
|
|
|
if brand=="" and parameters!="":
|
|
|
brand = parameters
|
|
|
if specs=="" and parameters!="":
|
|
@@ -165,12 +174,14 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
ots_name = _search.get("ots_name")
|
|
|
standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
|
|
|
- if is_similar(name,ots_name) or check_product(name,ots_name):
|
|
|
+ if check_product(name,ots_name,remove_words):
|
|
|
name_ots_id = ots_id
|
|
|
+ original_name = name
|
|
|
new_name = standard_name
|
|
|
|
|
|
- log("checking name %s succeed %s"%(name,ots_name))
|
|
|
+ log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
|
|
|
# #update alias of name
|
|
|
# _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
|
|
|
# _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
|
|
@@ -187,11 +198,13 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
ots_name = _search.get("ots_name")
|
|
|
standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
|
|
|
- if is_similar(name,ots_name,_radio=95):
|
|
|
+ if check_product(name,ots_name,remove_words):
|
|
|
|
|
|
- log("checking name %s succeed %s"%(name,ots_name))
|
|
|
+ log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
|
|
|
name_ots_id = ots_id
|
|
|
+ original_name = name
|
|
|
new_name = standard_name
|
|
|
|
|
|
# #update alias of name
|
|
@@ -206,9 +219,6 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
s_brand = brand
|
|
|
l_brand = [brand]
|
|
|
- l_brand.append(clean_product_brand(s_brand))
|
|
|
- brand_ch = get_chinese_string(brand)
|
|
|
- l_brand.extend(brand_ch)
|
|
|
|
|
|
Coll,_ = self.get_collection(BRAND_GRADE)
|
|
|
|
|
@@ -223,14 +233,16 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
ots_name = _search.get("ots_name")
|
|
|
standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
|
|
|
# log("check brand %s and %s"%(brand,ots_name))
|
|
|
- if is_similar(brand,ots_name) or check_brand(brand,ots_name):
|
|
|
+ if check_brand(brand,ots_name,remove_words):
|
|
|
|
|
|
# log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
|
|
|
if ots_name==new_name:
|
|
|
continue
|
|
|
+ original_brand = brand
|
|
|
new_brand = standard_name
|
|
|
|
|
|
log("checking brand %s succeed %s"%(brand,new_brand))
|
|
@@ -277,6 +289,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
new_brand = clean_product_brand(brand)
|
|
|
if new_brand=="":
|
|
|
continue
|
|
|
+ original_brand = brand
|
|
|
log("adding new brand %s"%(str(new_brand)))
|
|
|
_d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
|
|
|
DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
|
|
@@ -299,9 +312,6 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
if _find:
|
|
|
break
|
|
|
l_brand = [brand]
|
|
|
- l_brand.append(clean_product_brand(brand))
|
|
|
- brand_ch = get_chinese_string(brand)
|
|
|
- l_brand.extend(brand_ch)
|
|
|
|
|
|
for brand in l_brand:
|
|
|
if _find:
|
|
@@ -315,12 +325,14 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
ots_name = _search.get("ots_name")
|
|
|
standard_name = _search.get("standard_name")
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
+ remove_words = _search.get("remove_words")
|
|
|
|
|
|
# log("check brand %s and %s"%(brand,ots_name))
|
|
|
- if check_brand(brand,ots_name):
|
|
|
+ if check_brand(brand,ots_name,remove_words):
|
|
|
# log("check brand similar succeed:%s and %s"%(brand,ots_name))
|
|
|
if ots_name==new_name:
|
|
|
continue
|
|
|
+ original_brand = brand
|
|
|
new_brand = standard_name
|
|
|
|
|
|
log("checking brand %s succeed %s"%(brand,new_brand))
|
|
@@ -392,6 +404,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
# log("specs is_similar")
|
|
|
if check_specs(c_specs,ots_name):
|
|
|
break_flag = True
|
|
|
+ original_specs = c_specs
|
|
|
new_specs = standard_name
|
|
|
log("check_specs %s succeed %s"%(specs,new_specs))
|
|
|
|
|
@@ -435,6 +448,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
for specs in list_similar_specs:
|
|
|
if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
|
|
|
debug("is_legal_specs")
|
|
|
+ original_specs = specs
|
|
|
new_specs = clean_product_specs(specs)
|
|
|
# insert into document_product_dict a new record
|
|
|
# to update the document_product_dict which is builded for search
|
|
@@ -471,7 +485,7 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
break
|
|
|
if specs_ots_id is None:
|
|
|
_find = False
|
|
|
- for specs in list_candidates:
|
|
|
+ for specs in list_candidate_brand_specs:
|
|
|
if _find:
|
|
|
break
|
|
|
|
|
@@ -505,10 +519,11 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
ots_parent_id = _search.get("ots_parent_id")
|
|
|
|
|
|
debug("checking specs %s and %s"%(specs,ots_name))
|
|
|
- if is_similar(specs,ots_name):
|
|
|
+ if is_similar(c_specs,ots_name):
|
|
|
# log("specs is_similar")
|
|
|
if check_specs(c_specs,ots_name):
|
|
|
break_flag = True
|
|
|
+ original_specs = c_specs
|
|
|
new_specs = standard_name
|
|
|
if brand_ots_id is not None:
|
|
|
# judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
|
|
@@ -610,9 +625,9 @@ class Product_Manager(Product_Dict_Manager):
|
|
|
|
|
|
_product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
|
|
|
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
|
|
|
- _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,original_name,True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,original_brand,True)
|
|
|
+ _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,original_specs,True)
|
|
|
|
|
|
bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
|
|
|
if bid_filemd5s is not None:
|
|
@@ -852,13 +867,14 @@ def fix_product_data():
|
|
|
# table_name = "document_product"
|
|
|
# table_index = "document_product_index"
|
|
|
|
|
|
- columns = [DOCUMENT_PRODUCT_TMP_NEW_ID]
|
|
|
+ columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
|
|
|
ots_client = getConnect_ots()
|
|
|
- bool_query = BoolQuery(must_queries=[
|
|
|
+ bool_query = BoolQuery(should_queries=[
|
|
|
# RangeQuery("status",501),
|
|
|
# TermQuery("docid",246032980)
|
|
|
|
|
|
- RangeQuery("status",201,301)
|
|
|
+ RangeQuery("status",201,301),
|
|
|
+ RangeQuery("status",401,451)
|
|
|
# WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
|
|
|
# TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
|
|
|
])
|
|
@@ -923,9 +939,10 @@ def fix_product_data():
|
|
|
dpt.update_row(ots_client)
|
|
|
|
|
|
new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
|
|
|
- _d = {DOCUMENT_PRODUCT_ID:new_id}
|
|
|
- dp = Document_product(_d)
|
|
|
- dp.delete_row(ots_client)
|
|
|
+ if new_id is not None and new_id!="":
|
|
|
+ _d = {DOCUMENT_PRODUCT_ID:new_id}
|
|
|
+ dp = Document_product(_d)
|
|
|
+ dp.delete_row(ots_client)
|
|
|
|
|
|
def handle(item,result_queue):
|
|
|
win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
|
|
@@ -989,7 +1006,7 @@ def test_check_brand():
|
|
|
f.write(b+"\n")
|
|
|
|
|
|
def test_match():
|
|
|
- a = "数字化医用X射线摄影系统(DR)"
|
|
|
+ a = "兽医设备"
|
|
|
|
|
|
|
|
|
# vector = request_embedding(get_milvus_standard_name(a))
|
|
@@ -999,7 +1016,7 @@ def test_match():
|
|
|
Coll,_ = pm.get_collection(_GRADE)
|
|
|
print(Coll.name)
|
|
|
|
|
|
- output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
|
|
|
+ output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
|
|
|
# start_time = time.time()
|
|
|
# print(Coll.query(expr=" ots_id in ['75058b275a4c1d8ee38b58c5c5cce3bb'] ",output_fields=output_fields))
|
|
|
# print("cost",time.time()-start_time)
|
|
@@ -1257,19 +1274,57 @@ def delete_specs():
|
|
|
mt.run()
|
|
|
Coll.compact()
|
|
|
|
|
|
+def remove_redis_keys():
|
|
|
+ db = redis.Redis(connection_pool=pool_product)
|
|
|
+ db.flushdb()
|
|
|
+
|
|
|
+
|
|
|
+def update_document_product_dict():
|
|
|
+ import pandas as pd
|
|
|
+ filename = "update_product.csv"
|
|
|
+ df = pd.read_csv(filename,encoding="gbk")
|
|
|
+ ots_client = getConnect_ots()
|
|
|
+ for name,grade,standard_alias,remove_words,level in zip(df["name"],df["grade"],df["standard_alias"],df["remove_words"],df["level"]):
|
|
|
+ name = name.strip()
|
|
|
+ bool_query = BoolQuery(must_queries=[
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
|
|
|
+ TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
|
|
|
+ ])
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
|
|
|
+ SearchQuery(bool_query,get_total_count=True),
|
|
|
+ ColumnsToGet(return_type=ColumnReturnType.NONE))
|
|
|
+ if total_count==1:
|
|
|
+ list_data = getRow_ots(rows)
|
|
|
+ _data = list_data[0]
|
|
|
+ dpd = Document_product_dict(_data)
|
|
|
+ level = 1
|
|
|
+ if re.search("器械|设备|其他",name) is not None and level==1:
|
|
|
+ level = 2
|
|
|
+ if str(remove_words)=="nan":
|
|
|
+ remove_words = ""
|
|
|
+ dpd.setValue(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,standard_alias,True)
|
|
|
+ dpd.setValue(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,remove_words,True)
|
|
|
+ dpd.setValue(DOCUMENT_PRODUCT_DICT_LEVEL,level,True)
|
|
|
+ dpd.setValue(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED+1,True)
|
|
|
+ dpd.update_row(ots_client)
|
|
|
+ print(dpd.getProperties())
|
|
|
+
|
|
|
|
|
|
|
|
|
def test():
|
|
|
# pm = Product_Manager()
|
|
|
# pm.test()
|
|
|
- # fix_product_data()
|
|
|
+ fix_product_data()
|
|
|
# test_check_brand()
|
|
|
# test_match()
|
|
|
- rebuild_milvus()
|
|
|
+ # rebuild_milvus()
|
|
|
|
|
|
# move_document_product()
|
|
|
# delete_brands()
|
|
|
# delete_specs()
|
|
|
+ # remove_redis_keys()
|
|
|
+ # update_document_product_dict()
|
|
|
+
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|