Prechádzať zdrojové kódy

收紧品牌的匹配规则,修改original值,便于调试

luojiehua 1 rok pred
rodič
commit
14b9e7ca57

+ 3 - 0
.idea/encodings.xml

@@ -4,6 +4,9 @@
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/attachmentProcessTime.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/dataSource/searchPaddle.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_exclude_name_from_tw_prod.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_product_name_exclude_name.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/update_product.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-01-19_214304_export11.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-10-14_190838_数据导出.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/attachmentProcessTime2.xlsx" charset="GBK" />

+ 3 - 3
BaseDataMaintenance/maintenance/dataflow.py

@@ -3888,15 +3888,15 @@ class Dataflow_dumplicate(Dataflow):
 
 
             _time = time.time()
-            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
+            # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
             final_list = self.dumplicate_fianl_check(base_list)
 
             exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),table_name)
-            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
+            # log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
             best_docid = self.get_best_docid(final_list)
 
             final_list_docid = [a["docid"] for a in final_list]
-            log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
+            # log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
             _d = {"partitionkey":item["partitionkey"],
                   "docid":item["docid"],
                   "status":random.randint(*flow_dumplicate_status_to),

+ 6 - 1
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -763,7 +763,12 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
                     _soup = BeautifulSoup(_dochtmlcon,"lxml")
-                    _soup = article_limit(_soup,50000)
+                    if len(_dochtmlcon)>200000:
+                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
+                        _find.decompose()
+                    else:
+                        _soup = BeautifulSoup(_dochtmlcon,"lxml")
+                        _soup = article_limit(_soup,50000)
                     _dochtmlcon = str(_soup)
                 except Exception as e:
                     traceback.print_exc()

+ 52 - 23
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -25,7 +25,7 @@ from BaseDataMaintenance.model.ots.enterprise import *
 from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
 
 area_set = get_area_set()
-
+import jieba
 
 def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
 
@@ -34,17 +34,22 @@ def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,
     if v is not None:
         vector.append(v)
     if len(str(name))>=5:
-        strides = [3,6]
+        name_cut = list(jieba.cut(name))
+        strides = [1,2]
         for stride in strides:
-            steps = len(name)//stride
+            steps = len(name_cut)//stride
             if len(name)%stride>=stride//2+1:
                 steps += 1
             _begin = 0
+            _name = ""
             for i in range(min(steps,max_steps)):
-                _name = str(name)[i*stride:(i+1)*stride+2]
+                _name += "".join(name_cut[i*stride:(i+1)*stride])
+                if len(_name)<2:
+                    continue
                 v = get_embedding_request(_name)
                 if v is not None:
                     vector.append(v)
+                _name = ""
 
     if len(vector)>0:
         list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
@@ -67,10 +72,10 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
         except Exception as e:
             log("get redis data error")
         if _search_list is not None:
-            log("_search_list is not None")
+            # log("_search_list is not None")
             return json.loads(_search_list)
         else:
-            log("search from milvus")
+            # log("search from milvus")
             list_result = []
             result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
             for hits in result:
@@ -84,6 +89,9 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
                     _d[k] = _search.entity.get(k)
                 final_list.append(_d)
             final_list = remove_repeat_item(final_list,k="ots_name")
+            for _d in final_list:
+                _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
+            final_list.sort(key=lambda x:x.get("length_dis",0))
             final_list.sort(key=lambda x:x.get("level",1))
             try:
                 db.set(_md5,json.dumps(final_list))
@@ -129,6 +137,7 @@ def get_embedding_request(sentence,retry_times=3):
             if _embedding is not None:
                 try:
                     db.set(_md5,json.dumps(_embedding))
+                    db.expire(_md5,60*60)
                 except Exception as e:
                     log("set redis data error")
             return _embedding
@@ -230,10 +239,6 @@ def is_similar(source,target,_radio=None):
                 return True
         elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
             return True
-    # 全中文判断是否包含
-    if len(source)==max_len and judge_pur_chinese(target):
-        if str(source).find(target)>=0:
-            return True
     return False
 
 
@@ -254,7 +259,15 @@ def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9]+$"),find_patt
         else:
             return False
 
-def check_product(source,target):
+def check_product(source,target,remove_words):
+
+    if remove_words is not None and remove_words!="":
+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+        list_split = [a.strip() for a in _split if a.strip()!=""]
+        for _s in list_split:
+            if str(source).find(_s)>=0:
+                return False
+
     _check = check_char(source,target)
     if _check:
         return True
@@ -262,36 +275,48 @@ def check_product(source,target):
         if _check==False:
             return False
 
-    if is_contain(source,target,min_len=2):
+    if len(source)>len(target) and target in source:
         return True
+
     max_len = max(len(source),len(target))
     min_len = min(len(source),len(target))
-    min_ratio = 92
+
     if min_len<2:
         return False
     elif max_len<=5:
-        min_ratio=94
+        min_ratio=96
     else:
-        min_ratio = 90
+        min_ratio = 95
+    min_ratio = 98
     if is_similar(source,target,min_ratio):
         return True
     return False
 
 
-def check_brand(source,target):
+def check_brand(source,target,remove_words):
+
+
     source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
     target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
 
+    if remove_words is not None and remove_words!="":
+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+        list_split = [a.strip() for a in _split if a.strip()!=""]
+        for _s in _split:
+            if str(source).find(_s)>=0:
+                return False
+
     max_len = max(len(source),len(target))
     min_len = min(len(source),len(target))
 
-    min_ratio = 92
+
     if min_len<2:
         return False
     elif max_len<=5:
         min_ratio=94
     else:
         min_ratio = 90
+    min_ratio = 98
 
     source_c = "".join(get_chinese_string(source))
     target_c = "".join(get_chinese_string(target))
@@ -305,17 +330,20 @@ def check_brand(source,target):
 
     if len(source_c)>=2 and len(target_c)>=2:
         if not(source_c in area_set or target_c in area_set):
-            if is_similar(source_c,target_c,min_ratio):
+            if is_contain(source_c,target_c):
                 return True
 
-            if is_contain(source_c,target_c):
+            if is_similar(source_c,target_c,min_ratio):
                 return True
+
     if has_same_specs_count(source,target):
-        if is_similar(source,target,min_ratio):
-            return True
 
         if is_contain(source,target):
             return True
+        if is_similar(source,target,min_ratio):
+            return True
+
+
 
 SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
 NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
@@ -533,10 +561,11 @@ def clean_product_quantity(product_quantity):
 
 if __name__ == '__main__':
     # print(check_brand('杭州郎基','杭州利华'))
-    print(check_product("数字化医用X射线摄影系统(DR)","DR"))
+    # print(check_product("医用冷藏箱","医用","a|"))
+
     # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
     # import Levenshtein
     # print(Levenshtein.ratio('助听器','助行器'))
     # print(clean_product_specs("//4008SverssionV10"))
-    # print(is_legal_brand(getConnect_ots(),"康复"))
+    print(is_legal_brand(getConnect_ots(),"医用外科口罩"))
     # print(check_specs("500ml","3500ml"))

+ 10 - 6
BaseDataMaintenance/maintenance/product/product_dict.py

@@ -101,7 +101,7 @@ class Product_Dict_Manager():
 
 
 
-    def embedding_producer(self,columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS]):
+    def embedding_producer(self,columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,DOCUMENT_PRODUCT_DICT_LEVEL]):
 
         bool_query = BoolQuery(
             must_queries=[RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,3,5,True,True)],
@@ -131,16 +131,20 @@ class Product_Dict_Manager():
     def embedding_comsumer(self):
         def handle(item,result_queue):
             try:
+                _id = item.get(DOCUMENT_PRODUCT_DICT_ID)
                 name = str(item.get(DOCUMENT_PRODUCT_DICT_NAME))[:MAX_NAME_LENGTH]
 
                 parent_id = item.get(DOCUMENT_PRODUCT_DICT_PARENT_ID)
                 grade = item.get(DOCUMENT_PRODUCT_DICT_GRADE)
                 Coll,_ = self.get_collection(grade)
                 standard_alias = item.get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,"")
+                remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
+                level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL,1)
 
-                if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
 
-                    _pd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
+                if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level):
+
+                    _pd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:_id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
                     _pd.update_row(self.ots_client)
 
 
@@ -919,7 +923,7 @@ def dict_interface_delete(name,grade,ots_client = getConnect_ots()):
 
 def interface_deletes():
     a = '''
-    眼科
+    明细
     '''
     grade = 4
     ots_client=getConnect_ots()
@@ -1009,6 +1013,6 @@ def clean_brands():
 
 if __name__ == '__main__':
     # start_embedding_product_dict()
-    # interface_deletes()
+    interface_deletes()
     # clean_similar()
-    clean_brands()
+    # clean_brands()

+ 84 - 29
BaseDataMaintenance/maintenance/product/products.py

@@ -30,12 +30,13 @@ import logging
 root = logging.getLogger()
 root.setLevel(logging.INFO)
 from uuid import uuid4
+from multiprocessing import Queue as PQueue
 
 class Product_Manager(Product_Dict_Manager):
 
     def __init__(self):
         super(Product_Manager, self).__init__()
-        self.process_queue = Queue()
+        self.process_queue = PQueue()
         self.ots_client = getConnect_ots()
 
         self.set_id = set()
@@ -68,6 +69,7 @@ class Product_Manager(Product_Dict_Manager):
                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
         list_data = getRow_ots(rows)
         _count = len(list_data)
+        log("producer %d/%d"%(q_size,total_count))
         list_id = []
         for _d in list_data:
             _id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
@@ -113,7 +115,7 @@ class Product_Manager(Product_Dict_Manager):
 
 
 
-    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
+    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]):
         '''
         Standardizes the product data
         通过匹配标准参数表进行标准化,匹配是非精确匹配,校验规则是?
@@ -140,8 +142,15 @@ class Product_Manager(Product_Dict_Manager):
         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
         parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
 
+
+        original_name = name
+        original_brand = brand
+        original_specs = specs
+
         list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
 
+        list_candidate_brand_specs = [a for a in [brand,specs,parameters,name] if a!=""]
+
         if brand=="" and parameters!="":
             brand = parameters
         if specs=="" and parameters!="":
@@ -165,12 +174,14 @@ class Product_Manager(Product_Dict_Manager):
                 ots_name = _search.get("ots_name")
                 standard_name = _search.get("standard_name")
                 ots_parent_id = _search.get("ots_parent_id")
+                remove_words = _search.get("remove_words")
 
-                if is_similar(name,ots_name) or check_product(name,ots_name):
+                if check_product(name,ots_name,remove_words):
                     name_ots_id = ots_id
+                    original_name = name
                     new_name = standard_name
 
-                    log("checking name %s succeed %s"%(name,ots_name))
+                    log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
                     # #update alias of name
                     # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
                     # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
@@ -187,11 +198,13 @@ class Product_Manager(Product_Dict_Manager):
                     ots_name = _search.get("ots_name")
                     standard_name = _search.get("standard_name")
                     ots_parent_id = _search.get("ots_parent_id")
+                    remove_words = _search.get("remove_words")
 
-                    if is_similar(name,ots_name,_radio=95):
+                    if check_product(name,ots_name,remove_words):
 
-                        log("checking name %s succeed %s"%(name,ots_name))
+                        log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
                         name_ots_id = ots_id
+                        original_name = name
                         new_name = standard_name
 
                         # #update alias of name
@@ -206,9 +219,6 @@ class Product_Manager(Product_Dict_Manager):
 
                 s_brand = brand
                 l_brand = [brand]
-                l_brand.append(clean_product_brand(s_brand))
-                brand_ch = get_chinese_string(brand)
-                l_brand.extend(brand_ch)
 
                 Coll,_ = self.get_collection(BRAND_GRADE)
 
@@ -223,14 +233,16 @@ class Product_Manager(Product_Dict_Manager):
                         ots_name = _search.get("ots_name")
                         standard_name = _search.get("standard_name")
                         ots_parent_id = _search.get("ots_parent_id")
+                        remove_words = _search.get("remove_words")
 
                         # log("check brand %s and %s"%(brand,ots_name))
-                        if is_similar(brand,ots_name) or check_brand(brand,ots_name):
+                        if check_brand(brand,ots_name,remove_words):
 
                             # log("check brand similar succeed:%s and %s"%(brand,ots_name))
 
                             if ots_name==new_name:
                                 continue
+                            original_brand = brand
                             new_brand = standard_name
 
                             log("checking brand %s succeed %s"%(brand,new_brand))
@@ -277,6 +289,7 @@ class Product_Manager(Product_Dict_Manager):
                             new_brand = clean_product_brand(brand)
                             if new_brand=="":
                                 continue
+                            original_brand = brand
                             log("adding new brand %s"%(str(new_brand)))
                             _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
@@ -299,9 +312,6 @@ class Product_Manager(Product_Dict_Manager):
                     if _find:
                         break
                     l_brand = [brand]
-                    l_brand.append(clean_product_brand(brand))
-                    brand_ch = get_chinese_string(brand)
-                    l_brand.extend(brand_ch)
 
                     for brand in l_brand:
                         if _find:
@@ -315,12 +325,14 @@ class Product_Manager(Product_Dict_Manager):
                             ots_name = _search.get("ots_name")
                             standard_name = _search.get("standard_name")
                             ots_parent_id = _search.get("ots_parent_id")
+                            remove_words = _search.get("remove_words")
 
                             # log("check brand %s and %s"%(brand,ots_name))
-                            if check_brand(brand,ots_name):
+                            if check_brand(brand,ots_name,remove_words):
                                 # log("check brand similar succeed:%s and %s"%(brand,ots_name))
                                 if ots_name==new_name:
                                     continue
+                                orignal_brand = brand
                                 new_brand = standard_name
 
                                 log("checking brand %s succeed %s"%(brand,new_brand))
@@ -392,6 +404,7 @@ class Product_Manager(Product_Dict_Manager):
                                 # log("specs is_similar")
                                 if check_specs(c_specs,ots_name):
                                     break_flag = True
+                                    original_specs = c_specs
                                     new_specs = standard_name
                                     log("check_specs %s succeed %s"%(specs,new_specs))
 
@@ -435,6 +448,7 @@ class Product_Manager(Product_Dict_Manager):
                     for specs in list_similar_specs:
                         if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
                             debug("is_legal_specs")
+                            original_specs = specs
                             new_specs = clean_product_specs(specs)
                             # insert into document_product_dict a new record
                             # to update the document_product_dict which is builded for search
@@ -471,7 +485,7 @@ class Product_Manager(Product_Dict_Manager):
                             break
         if specs_ots_id is None:
             _find = False
-            for specs in list_candidates:
+            for specs in list_candidate_brand_specs:
                 if _find:
                     break
 
@@ -505,10 +519,11 @@ class Product_Manager(Product_Dict_Manager):
                             ots_parent_id = _search.get("ots_parent_id")
 
                             debug("checking specs %s and %s"%(specs,ots_name))
-                            if is_similar(specs,ots_name):
+                            if is_similar(c_specs,ots_name):
                                 # log("specs is_similar")
                                 if check_specs(c_specs,ots_name):
                                     break_flag = True
+                                    original_specs = c_specs
                                     new_specs = standard_name
                                     if brand_ots_id is not None:
                                         # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
@@ -610,9 +625,9 @@ class Product_Manager(Product_Dict_Manager):
 
             _product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
 
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,original_name,True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,original_brand,True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,original_specs,True)
 
             bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
             if bid_filemd5s is not None:
@@ -852,13 +867,14 @@ def fix_product_data():
     # table_name = "document_product"
     # table_index = "document_product_index"
 
-    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID]
+    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
     ots_client = getConnect_ots()
-    bool_query = BoolQuery(must_queries=[
+    bool_query = BoolQuery(should_queries=[
         # RangeQuery("status",501),
         # TermQuery("docid",246032980)
 
-        RangeQuery("status",201,301)
+        RangeQuery("status",201,301),
+        RangeQuery("status",401,451)
         # WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
         # TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
     ])
@@ -923,9 +939,10 @@ def fix_product_data():
         dpt.update_row(ots_client)
 
         new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
-        _d = {DOCUMENT_PRODUCT_ID:new_id}
-        dp = Document_product(_d)
-        dp.delete_row(ots_client)
+        if new_id is not None and new_id!="":
+            _d = {DOCUMENT_PRODUCT_ID:new_id}
+            dp = Document_product(_d)
+            dp.delete_row(ots_client)
 
     def handle(item,result_queue):
         win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
@@ -989,7 +1006,7 @@ def test_check_brand():
             f.write(b+"\n")
 
 def test_match():
-    a = "数字化医用X射线摄影系统(DR)"
+    a = "兽医设备"
 
 
     # vector = request_embedding(get_milvus_standard_name(a))
@@ -999,7 +1016,7 @@ def test_match():
     Coll,_ = pm.get_collection(_GRADE)
     print(Coll.name)
 
-    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
+    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
     # start_time = time.time()
     # print(Coll.query(expr=" ots_id in ['75058b275a4c1d8ee38b58c5c5cce3bb'] ",output_fields=output_fields))
     # print("cost",time.time()-start_time)
@@ -1257,19 +1274,57 @@ def delete_specs():
     mt.run()
     Coll.compact()
 
+def remove_redis_keys():
+    db = redis.Redis(connection_pool=pool_product)
+    db.flushdb()
+
+
+def update_document_product_dict():
+    import pandas as pd
+    filename = "update_product.csv"
+    df = pd.read_csv(filename,encoding="gbk")
+    ots_client = getConnect_ots()
+    for name,grade,standard_alias,remove_words,level in zip(df["name"],df["grade"],df["standard_alias"],df["remove_words"],df["level"]):
+        name = name.strip()
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
+        ])
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
+        if total_count==1:
+            list_data = getRow_ots(rows)
+            _data = list_data[0]
+            dpd = Document_product_dict(_data)
+            level = 1
+            if re.search("器械|设备|其他",name) is not None and level==1:
+                level = 2
+            if str(remove_words)=="nan":
+                remove_words = ""
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,standard_alias,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,remove_words,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_LEVEL,level,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED+1,True)
+            dpd.update_row(ots_client)
+            print(dpd.getProperties())
+
 
 
 def test():
     # pm = Product_Manager()
     # pm.test()
-    # fix_product_data()
+    fix_product_data()
     # test_check_brand()
     # test_match()
-    rebuild_milvus()
+    # rebuild_milvus()
 
     # move_document_product()
     # delete_brands()
     # delete_specs()
+    # remove_redis_keys()
+    # update_document_product_dict()
+
 
 if __name__ == '__main__':