
Improve the interface-table operations: updates now support modifying the alias, standard alias, remove words, and level

luojiehua 1 year ago
Parent commit 44498254fe
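
Note on the new update semantics: the update action treats the alias, standard_alias, and remove_words fields as incremental edits — a leading "+" appends entries, a leading "-" removes entries, and a bare list replaces the whole set (see get_updated_record below). A minimal sketch of enqueuing such an update row, mirroring the interface_update helper added in this commit; Document_product_dict_interface, getCurrent_date, getConnect_ots and the DOCUMENT_PRODUCT_DICT_INTERFACE_* constants are this repo's:

    from uuid import uuid4

    # Sketch: append the remove word "设备" to the grade-4 entry "保健".
    # All names below come from BaseDataMaintenance; values are illustrative.
    _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME: "保健",
          DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS: 1,
          DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE: 4,
          DOCUMENT_PRODUCT_DICT_INTERFACE_ID: uuid4().hex,
          DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION: "update",
          DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS: "",     # unchanged
          DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS: "+设备",  # "+" = append
          DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME: getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
    Document_product_dict_interface(_d).update_row(getConnect_ots())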

+ 1 - 1
BaseDataMaintenance/maintenance/dataflow.py

@@ -4039,7 +4039,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule.add_job(self.flow_dumplicate,"cron",second="*/40")
         schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
         schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
-        schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+        # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
         schedule.start()
 
     def changeSaveStatus(self,list_dict):
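
The hunk above disables the hourly fix_doc_which_not_in_project pass while keeping the 40-second dedup jobs. For reference, a standalone sketch of the same APScheduler cron style (the job body is a placeholder):

    from apscheduler.schedulers.blocking import BlockingScheduler

    def fix_pass():
        # stand-in for self.fix_doc_which_not_in_project
        print("hourly repair pass")

    schedule = BlockingScheduler()
    schedule.add_job(fix_pass, "cron", minute="55")    # fires at minute 55 of every hour
    schedule.add_job(fix_pass, "cron", second="*/40")  # fires every 40 seconds
    schedule.start()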

+ 0 - 1
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -767,7 +767,6 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                         _find = _soup.find("div",attrs={"class":"richTextFetch"})
                         _find.decompose()
                     else:
-                        _soup = BeautifulSoup(_dochtmlcon,"lxml")
                         _soup = article_limit(_soup,50000)
                     _dochtmlcon = str(_soup)
                 except Exception as e:
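
The removed line re-parsed _dochtmlcon from scratch inside the else branch, discarding the _soup already built above; the branch now truncates the existing tree instead. A sketch of the intended flow, assuming article_limit is this repo's helper that trims a parsed document to roughly the given character budget:

    from bs4 import BeautifulSoup

    _dochtmlcon = "<html><body><div>bid notice ...</div></body></html>"  # sample input
    _soup = BeautifulSoup(_dochtmlcon, "lxml")                  # parse once
    _find = _soup.find("div", attrs={"class": "richTextFetch"})
    if _find is not None:
        _find.decompose()                                       # drop fetched attachment text
    else:
        _soup = article_limit(_soup, 50000)                     # repo helper: cap body size
    _dochtmlcon = str(_soup)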

+ 12 - 6
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -90,8 +90,11 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
                 final_list.append(_d)
             final_list = remove_repeat_item(final_list,k="ots_name")
             for _d in final_list:
-                _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
-            final_list.sort(key=lambda x:x.get("length_dis",0))
+                # _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
+                standard_set = set(_d.get("standard_name",""))
+                name_set = set(name)
+                _d["length_dis"] = len(standard_set&name_set)/max(len(standard_set)+len(name_set),1)
+            final_list.sort(key=lambda x:x.get("length_dis",0),reverse=True)
             final_list.sort(key=lambda x:x.get("level",1))
             try:
                 db.set(_md5,json.dumps(final_list))
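
The ranking change replaces absolute length difference with a character-overlap ratio (shared characters over combined size, a Dice-like score) sorted descending, then stable-sorted by level so level still dominates. A self-contained sketch of the new two-key ordering:

    def overlap_ratio(standard_name, name):
        s, n = set(standard_name), set(name)
        return len(s & n) / max(len(s) + len(n), 1)

    candidates = [
        {"standard_name": "医用冷藏箱", "level": 1},
        {"standard_name": "冷藏箱", "level": 1},
        {"standard_name": "医用冷藏柜", "level": 2},
    ]
    name = "医用冷藏箱"
    for d in candidates:
        d["length_dis"] = overlap_ratio(d["standard_name"], name)
    candidates.sort(key=lambda x: x["length_dis"], reverse=True)  # higher overlap first
    candidates.sort(key=lambda x: x["level"])                     # stable sort: level wins
    print([d["standard_name"] for d in candidates])
    # ['医用冷藏箱', '冷藏箱', '医用冷藏柜']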
@@ -250,7 +253,7 @@ def is_contain(source,target,min_len=2):
         return True
     return False
 
-def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9]+)")):
+def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9\-]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9-]+)")):
     if re.search(chat_pattern,source) is not None or re.search(chat_pattern,target) is not None:
         a = set(re.findall(find_pattern,source))
         b = set(re.findall(find_pattern,target))
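
Allowing "-" inside both patterns keeps hyphenated model codes as single tokens instead of fragments, which is what the new check_brand test at the bottom of this file exercises:

    import re

    find_pattern = re.compile("(?P<product>[a-zA-Z0-9-]+)")
    old_pattern  = re.compile("(?P<product>[a-zA-Z0-9]+)")
    s = "DYW-JY-T01-A1(定制)"
    print(re.findall(find_pattern, s))  # ['DYW-JY-T01-A1']  -- one token
    print(re.findall(old_pattern, s))   # ['DYW', 'JY', 'T01', 'A1']  -- fragments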
@@ -306,6 +309,7 @@ def check_brand(source,target,remove_words):
             if str(source).find(_s)>=0:
                 return False
 
+
     max_len = max(len(source),len(target))
     min_len = min(len(source),len(target))
 
@@ -335,6 +339,8 @@ def check_brand(source,target,remove_words):
 
             if is_similar(source_c,target_c,min_ratio):
                 return True
+        else:
+            return False
 
     if has_same_specs_count(source,target):
 
@@ -517,7 +523,7 @@ def clean_product_brand(product_brand):
     return brand
 
 
-def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]|^[\\/.-]+")):
+def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")):
     '''
     clean before insert
     :param product_specs:
@@ -560,12 +566,12 @@ def clean_product_quantity(product_quantity):
     return ""
 
 if __name__ == '__main__':
-    # print(check_brand('杭州郎基','杭州利华'))
+    # print(check_brand('DYW-JY-T01-A1(定制)','JY',''))
     # print(check_product("医用冷藏箱","医用","a|"))
 
     # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
     # import Levenshtein
     # print(Levenshtein.ratio('助听器','助行器'))
     # print(clean_product_specs("//4008SverssionV10"))
-    print(is_legal_brand(getConnect_ots(),"医用外科口罩"))
+    print(is_legal_brand(getConnect_ots(),"保健"))
     # print(check_specs("500ml","3500ml"))
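
The widened class in clean_product_specs now also keeps "×" and "*", so dimension-style specs survive cleaning. A quick before/after check, assuming the cleaning amounts to stripping characters outside the class:

    import re

    new_pat = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")
    old_pat = re.compile("[^A-Za-z0-9-\\/()().]|^[\\/.-]+")
    s = "尺寸600×400×900mm"
    print(re.sub(new_pat, "", s))  # 600×400×900mm  -- dimensions preserved
    print(re.sub(old_pat, "", s))  # 600400900mm    -- "×" used to be stripped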

+ 693 - 97
BaseDataMaintenance/maintenance/product/product_dict.py

@@ -24,7 +24,7 @@ import requests
 from random import randint
 
 
-IS_SYNCHONIZED = 2
+IS_SYNCHONIZED = 3
 
 class Product_Dict_Manager():
 
@@ -104,10 +104,13 @@ class Product_Dict_Manager():
     def embedding_producer(self,columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,DOCUMENT_PRODUCT_DICT_LEVEL]):
 
         bool_query = BoolQuery(
-            must_queries=[RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,3,5,True,True)],
+            must_queries=[
+                TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE),
+                RangeQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,3,5,True,True)
+            ],
             must_not_queries=[TermQuery(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED)])
 
-        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED)]),limit=100,get_total_count=True),
                                                                             columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
 
@@ -116,18 +119,17 @@ class Product_Dict_Manager():
             self.queue_product_dict.put(_d)
 
         while next_token:
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                                 SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                 columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
             list_dict = getRow_ots(rows)
             for _d in list_dict:
                 self.queue_product_dict.put(_d)
-            if self.queue_product_dict.qsize()>=1000:
+            if self.queue_product_dict.qsize()>=10000:
                 break
         log("product_dict embedding total_count:%d"%total_count)
 
 
-
     def embedding_comsumer(self):
         def handle(item,result_queue):
             try:
@@ -141,10 +143,9 @@ class Product_Dict_Manager():
                 remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
                 level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL,1)
 
-
                 if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level):
 
-                    _pd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:_id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
+                    _pd = Document_product_dict_interface({DOCUMENT_PRODUCT_DICT_ID:_id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
                     _pd.update_row(self.ots_client)
 
 
@@ -294,7 +295,6 @@ class Product_Dict_Manager():
 
 
     def make_query(self,name,column,query_type,min_len,strides):
-
         should_q = []
         strides_spce = len(name)-min_len+1
         for _i in range(min(strides_spce,strides)):
@@ -308,9 +308,8 @@ class Product_Dict_Manager():
         return None
 
 
-
-
-    def process_history(self,list_name,grade,action):
+    def process_history_by_name(self,list_name,grade,action):
+        assert action in [DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_UPDATE,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE]
         if grade==NAME_GRADE:
             self.process_history_name(list_name,action)
         elif grade==BRAND_GRADE:
@@ -318,6 +317,315 @@ class Product_Dict_Manager():
         elif grade==SPECS_GRADE:
             self.process_history_specs(list_name,action)
 
+    def process_history_by_standard_name(self,name,grade,list_name,action):
+        assert action in [DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE]
+        if grade==NAME_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_NAME,TermQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = bool_query
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+                        for _d in list_data:
+                            dict_name_id = _d.get(DOCUMENT_PRODUCT_DICT_NAME_ID)
+                            for dict_id in [dict_name_id]:
+                                if dict_id is not None and dict_id!="":
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_id})
+                                    self.recurse_delete_dict(dict_name_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+                    log("%s insert standard_alias %s exists %d counts "%(name,n_name,total_count))
+                    if total_count==0:
+                        self.process_history_name([n_name],action)
+
+            elif action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_ORIGINAL_NAME,MatchPhraseQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_NAME,name),
+                            bool_query
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+                        for _d in list_data:
+                            dict_name_id = _d.get(DOCUMENT_PRODUCT_DICT_NAME_ID)
+                            if dict_name_id is not None and dict_name_id!="":
+                                _query = BoolQuery(must_queries=[
+                                    TermQuery(DOCUMENT_PRODUCT_DICT_NAME_ID,dict_name_id)
+                                ])
+                                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                    SearchQuery(_query,get_total_count=True))
+                                if total_count==1:
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_name_id})
+                                    self.recurse_delete_dict(dict_name_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+        if grade==BRAND_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_BRAND,TermQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = bool_query
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+                        for _d in list_data:
+                            dict_brand_id = _d.get(DOCUMENT_PRODUCT_DICT_BRAND_ID)
+                            for dict_id in [dict_brand_id]:
+                                if dict_id is not None and dict_id!="":
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_id})
+                                    self.recurse_delete_dict(dict_brand_id)
+                                    dpd.delete_row(self.ots_client)
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+                    log("%s insert standard_alias %s exists %d counts "%(name,n_name,total_count))
+                    if total_count==0:
+                        self.process_history_brand([n_name],action)
+
+            elif action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_ORIGINAL_BRAND,MatchPhraseQuery,4,5)
+                    if bool_query is not None:
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_BRAND,name),
+                            bool_query
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+                        for _d in list_data:
+                            dict_brand_id = _d.get(DOCUMENT_PRODUCT_DICT_BRAND_ID)
+                            if dict_brand_id is not None and dict_brand_id!="":
+                                _query = BoolQuery(must_queries=[
+                                    TermQuery(DOCUMENT_PRODUCT_DICT_BRAND_ID,dict_brand_id)
+                                ])
+                                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                    SearchQuery(_query,get_total_count=True))
+                                if total_count==1:
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_brand_id})
+                                    self.recurse_delete_dict(dict_brand_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+        if grade==SPECS_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_SPECS,TermQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = bool_query
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+                        for _d in list_data:
+                            dict_specs_id = _d.get(DOCUMENT_PRODUCT_DICT_SPECS_ID)
+                            for dict_id in [dict_specs_id]:
+                                if dict_id is not None and dict_id!="":
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_id})
+                                    self.recurse_delete_dict(dict_specs_id)
+                                    dpd.delete_row(self.ots_client)
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+                    log("%s insert standard_alias %s exists %d counts "%(name,n_name,total_count))
+                    if total_count==0:
+                        self.process_history_specs([n_name],action)
+
+            elif action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                _query = BoolQuery(must_queries=[
+                    TermQuery(DOCUMENT_PRODUCT_SPECS,name),
+                ])
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                    SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                    columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                list_data = getRow_ots(rows)
+                while next_token:
+                    rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                        SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                        columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                    list_data.extend(getRow_ots(rows))
+                for _d in list_data:
+                    dict_specs_id = _d.get(DOCUMENT_PRODUCT_DICT_SPECS_ID)
+                    if dict_specs_id is not None and dict_specs_id!="":
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_DICT_SPECS_ID,dict_specs_id)
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,get_total_count=True))
+                        if total_count==1:
+                            dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_specs_id})
+                            self.recurse_delete_dict(dict_specs_id)
+                            dpd.delete_row(self.ots_client)
+
+                    _id = _d.get(DOCUMENT_PRODUCT_ID)
+                    original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                    self.rerun(_id,original_id)
+
+
+    def process_history_by_remove_words(self,name,grade,list_name,action):
+        assert action in [DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE]
+        if grade==NAME_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_ORIGINAL_NAME,MatchPhraseQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_NAME,name),
+                            bool_query
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+
+
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_NAME_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+
+                        for _d in list_data:
+                            dict_name_id = _d.get(DOCUMENT_PRODUCT_DICT_NAME_ID)
+                            if dict_name_id is not None and dict_name_id!="":
+                                _query = BoolQuery(must_queries=[
+                                    TermQuery(DOCUMENT_PRODUCT_DICT_NAME_ID,dict_name_id)
+                                ])
+                                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                    SearchQuery(_query,get_total_count=True))
+                                if total_count==1:
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_name_id})
+                                    self.recurse_delete_dict(dict_name_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+
+            elif action == DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                self.process_history_name(list_name,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT)
+        if grade==BRAND_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_ORIGINAL_BRAND,MatchPhraseQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_BRAND,name),
+                            bool_query
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_BRAND_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+
+                        for _d in list_data:
+                            dict_brand_id = _d.get(DOCUMENT_PRODUCT_DICT_BRAND_ID)
+                            if dict_brand_id is not None and dict_brand_id!="":
+                                _query = BoolQuery(must_queries=[
+                                    TermQuery(DOCUMENT_PRODUCT_DICT_BRAND_ID,dict_brand_id)
+                                ])
+                                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                    SearchQuery(_query,get_total_count=True))
+                                if total_count==1:
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_brand_id})
+                                    self.recurse_delete_dict(dict_brand_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+
+            elif action == DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                self.process_history_brand(list_name,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT)
+        if grade==SPECS_GRADE:
+            if action==DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT:
+                for n_name in list_name:
+                    bool_query = self.make_query(n_name,DOCUMENT_PRODUCT_ORIGINAL_SPECS,MatchPhraseQuery,len(n_name),5)
+                    if bool_query is not None:
+                        _query = BoolQuery(must_queries=[
+                            TermQuery(DOCUMENT_PRODUCT_SPECS,name),
+                            bool_query
+                        ])
+                        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                            columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                        list_data = getRow_ots(rows)
+                        while next_token:
+                            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                                columns_to_get=ColumnsToGet([DOCUMENT_PRODUCT_ORIGINAL_ID,DOCUMENT_PRODUCT_DICT_SPECS_ID],return_type=ColumnReturnType.SPECIFIED))
+                            list_data.extend(getRow_ots(rows))
+
+                        for _d in list_data:
+                            dict_specs_id = _d.get(DOCUMENT_PRODUCT_DICT_SPECS_ID)
+                            if dict_specs_id is not None and dict_specs_id!="":
+                                _query = BoolQuery(must_queries=[
+                                    TermQuery(DOCUMENT_PRODUCT_DICT_SPECS_ID,dict_specs_id)
+                                ])
+                                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                                    SearchQuery(_query,get_total_count=True))
+                                if total_count==1:
+                                    dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:dict_specs_id})
+                                    self.recurse_delete_dict(dict_specs_id)
+                                    dpd.delete_row(self.ots_client)
+
+                            _id = _d.get(DOCUMENT_PRODUCT_ID)
+                            original_id = _d.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+                            self.rerun(_id,original_id)
+
+            elif action == DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE:
+                self.process_history_specs(list_name,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT)
+
+
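The two process_history_by_* methods above repeat one scan-and-collect idiom: run a search, then follow next_token until the index is drained. The same idiom factored into a helper, for reference (getRow_ots is this repo's row-to-dict helper; the tablestore imports match this module's usage):

    from tablestore import (SearchQuery, Sort, FieldSort,
                            ColumnsToGet, ColumnReturnType)

    def scan_all(ots_client, table, index, query, columns):
        # Drain a tablestore search index page by page via next_token.
        rows, next_token, total_count, _ = ots_client.search(
            table, index,
            SearchQuery(query, sort=Sort(sorters=[FieldSort("status")]),
                        limit=100, get_total_count=True),
            columns_to_get=ColumnsToGet(columns, ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        while next_token:
            rows, next_token, _, _ = ots_client.search(
                table, index,
                SearchQuery(query, next_token=next_token, limit=100,
                            get_total_count=True),
                columns_to_get=ColumnsToGet(columns, ColumnReturnType.SPECIFIED))
            list_data.extend(getRow_ots(rows))
        return list_data, total_count
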
     def exists_records(self,name,grade,create_time):
         term_columns = None
         if grade==NAME_GRADE:
@@ -339,8 +647,7 @@ class Product_Dict_Manager():
         return False
 
 
-    def act_insert(self,name,alias,grade,original_id,parent_id,standard_alias,create_time):
-
+    def act_insert(self,name,alias,grade,original_id,parent_id,standard_alias,create_time,remove_words,level):
 
         #update document_product_dict
         if original_id is None or original_id=="":
@@ -363,14 +670,24 @@ class Product_Dict_Manager():
             _dpd.update_row(self.ots_client)
 
         # search interface if name and grade exists then update document_product_dict and return
-        bool_query = BoolQuery(must_queries=[
-            TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,name),
-            TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,grade),
-            RangeQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS,201,301)
-        ])
-        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
-                                                                            SearchQuery(bool_query,get_total_count=True))
-        if total_count>0:
+
+        interface_id = get_milvus_product_dict_id(name)
+        _interface_d = {
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ID:interface_id,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:alias,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:name,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:randint(201,300),
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:parent_id,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:standard_alias,
+            DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+            DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+            DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:remove_words,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL:level
+        }
+        _dpdi = Document_product_dict_interface(_interface_d)
+        if _dpdi.exists_row(self.ots_client):
             return
 
         list_name = []
@@ -393,90 +710,258 @@ class Product_Dict_Manager():
 
         #judge whether there exists records before this record created,if not process the history data
         if not self.exists_records(name,grade,create_time):
-            self.process_history(list_name,grade,"insert")
+            self.process_history_by_name(list_name,grade,"insert")
 
+        _dpdi.update_row(self.ots_client)
 
-    def act_update(self,name,alias,grade,original_id,parent_id,standard_alias,create_time):
-        # check whether there are change variable
-        if original_id is None or original_id=="":
-            return
-        _d = {DOCUMENT_PRODUCT_DICT_ID:original_id}
-        dpd = Document_product_dict(_d)
-        if not dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_ALIAS,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_CREATE_TIME,DOCUMENT_PRODUCT_DICT_UPDATE_TIME,DOCUMENT_PRODUCT_DICT_STATUS],True):
-            return
+    def get_updated_record(self,alias,standard_alias,remove_words,level,original_alias,original_standard_alias,original_remove_words,original_level):
 
-        if parent_id is None or parent_id=="":
-            parent_id = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_PARENT_ID)
-        old_name = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_NAME)
-        old_name_set = set([old_name])
-        _alias = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS)
-        if _alias is not None:
-            for s in _alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR):
-                if s!="":
-                    old_name_set.add(s)
-
-        new_name_set = set([name])
-        if standard_alias is not None:
-            for s in standard_alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR):
-                if s!="":
-                    new_name_set.add(s)
-
-        if old_name!=name:
-            new_id = get_document_product_dict_id(parent_id, name)
-        else:
-            new_id = original_id
+        original_alias_set = set()
+        original_standard_alias_set = set()
+        original_remove_words_set = set()
 
+        if original_alias is not None and original_alias!="":
+            _split = original_alias.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+            for _s in _split:
+                _s = _s.strip()
+                if _s=="":
+                    continue
+                original_alias_set.add(_s)
+        if original_standard_alias is not None and original_standard_alias!="":
+            _split = original_standard_alias.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+            for _s in _split:
+                _s = _s.strip()
+                if _s=="":
+                    continue
+                original_standard_alias_set.add(_s)
+        if original_remove_words is not None and original_remove_words!="":
+            _split = original_remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+            for _s in _split:
+                _s = _s.strip()
+                if _s=="":
+                    continue
+                original_remove_words_set.add(_s)
 
-        Coll,_ = self.get_collection(grade)
+        new_alias_set = set()
+        new_standard_alias_set = set()
+        new_remove_words_set = set()
+        if alias is not None and alias!="":
+            if alias[0]=="+":
+                new_alias_set |= original_alias_set
+                _split = alias[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_alias_set.add(_s)
+            elif alias[0]=="-":
+                new_alias_set |= original_alias_set
+                _split = alias[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    if _s in new_alias_set:
+                        new_alias_set.remove(_s)
+            else:
+                _split = alias.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_alias_set.add(_s)
+        else:
+            new_alias_set = original_alias_set
 
-        delete_names = list(old_name_set-new_name_set)
-        insert_names = list(new_name_set-old_name_set)
-        # update the milvus
-        if len(delete_names)>0:
-            for _name in delete_names:
-                delete_record_from_milvus(Coll,_name,"")
-            time.sleep(1)
-        if len(insert_names)>0:
-            insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias)
+        if standard_alias is not None and standard_alias!="":
+            if standard_alias[0]=="+":
+                new_standard_alias_set |= original_standard_alias_set
+                _split = standard_alias[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_standard_alias_set.add(_s)
+            elif standard_alias[0]=="-":
+                new_standard_alias_set |= original_standard_alias_set
+                _split = standard_alias[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    if _s in new_standard_alias_set:
+                        new_standard_alias_set.remove(_s)
+            else:
+                _split = standard_alias.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_standard_alias_set.add(_s)
+        else:
+            new_standard_alias_set = original_standard_alias_set
+
+        if remove_words is not None and remove_words!="":
+            if remove_words[0]=="+":
+                new_remove_words_set |= original_remove_words_set
+                _split = remove_words[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_remove_words_set.add(_s)
+            elif remove_words[0]=="-":
+                new_remove_words_set |= original_remove_words_set
+                _split = remove_words[1:].split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    if _s in new_remove_words_set:
+                        new_remove_words_set.remove(_s)
+            else:
+                _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+                for _s in _split:
+                    _s = _s.strip()
+                    if _s=="":
+                        continue
+                    new_remove_words_set.add(_s)
+        else:
+            new_remove_words_set = original_remove_words_set
+        update_flag = False
+        milvus_update_flag = False
+        if len(new_alias_set&original_alias_set)!=len(new_alias_set):
+            update_flag = True
+        if len(new_standard_alias_set&original_standard_alias_set)!=len(new_standard_alias_set):
+            update_flag = True
+            milvus_update_flag = True
+        if len(new_remove_words_set&original_remove_words_set)!=len(new_remove_words_set):
+            update_flag = True
+            milvus_update_flag = True
+        if str(level)!=str(original_level):
+            update_flag = True
+            milvus_update_flag = True
+
+        return update_flag,milvus_update_flag,original_alias_set,original_standard_alias_set,original_remove_words_set,new_alias_set,new_standard_alias_set,new_remove_words_set
+
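get_updated_record applies one three-way rule to alias, standard_alias, and remove_words alike: a value starting with "+" is unioned into the original set, "-" is subtracted from it, a bare value replaces it, and an empty value keeps the original. The rule, distilled (the actual separator is DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR; "|" here is illustrative):

    # "+a|b" adds, "-a|b" removes, "a|b" replaces, ""/None keeps the original.
    def merge_field(new_value, original_set, sep="|"):
        if not new_value:
            return set(original_set)
        if new_value[0] == "+":
            return original_set | {s.strip() for s in new_value[1:].split(sep) if s.strip()}
        if new_value[0] == "-":
            return original_set - {s.strip() for s in new_value[1:].split(sep) if s.strip()}
        return {s.strip() for s in new_value.split(sep) if s.strip()}

    print(merge_field("+设备", {"保健品"}))    # {'保健品', '设备'}
    print(merge_field("-保健品", {"保健品"}))  # set()
    print(merge_field("a|b", {"c"}))           # {'a', 'b'}
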
+    def act_update(self,name,alias,grade,original_id,parent_id,standard_alias,create_time,remove_words,level):
+        # check whether there are change variable
 
-        # process history
-        if len(delete_names)>0:
-            self.process_history([old_name],grade,"update")
-        if len(insert_names)>0:
-            self.process_history(insert_names,grade,"insert")
+        _interface_id = get_milvus_product_dict_id(name)
+        _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:_interface_id}
+        _dpdi = Document_product_dict_interface(_d)
+        if not _dpdi.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS,DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS,DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL],True):
+            return
 
+        original_alias = _dpdi.getProperties().get(DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS)
+        original_standard_alias = _dpdi.getProperties().get(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS)
+        original_remove_words = _dpdi.getProperties().get(DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS)
+        original_level = _dpdi.getProperties().get(DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL)
 
+        update_flag,milvus_update_flag,original_alias_set,original_standard_alias_set,original_remove_words_set,new_alias_set,new_standard_alias_set,new_remove_words_set = self.get_updated_record(alias,standard_alias,remove_words,level,original_alias,original_standard_alias,original_remove_words,original_level)
 
-        # update document_product_dict
-        _d = {DOCUMENT_PRODUCT_DICT_ID:new_id,
-              DOCUMENT_PRODUCT_DICT_NAME:name,
-              DOCUMENT_PRODUCT_DICT_GRADE:grade,
-              DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
-              DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS:standard_alias}
+        if not update_flag:
+            return
 
-        if alias is not None and alias!="":
-            _d[DOCUMENT_PRODUCT_DICT_ALIAS] = alias
-        if parent_id is not None and parent_id!="":
-            _d[DOCUMENT_PRODUCT_DICT_PARENT_ID] = parent_id
-        if old_name!=name:
-            _d[DOCUMENT_PRODUCT_DICT_CREATE_TIME] = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_CREATE_TIME)
-            _d[DOCUMENT_PRODUCT_DICT_UPDATE_TIME] = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
-            _d[DOCUMENT_PRODUCT_DICT_STATUS] = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_STATUS)
+        interface_id = get_milvus_product_dict_id(name)
+        final_alias = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_alias_set))
+        final_standard_alias = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_standard_alias_set))
+        final_remove_words = DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR.join(list(new_remove_words_set))
+        if parent_id is None:
+            parent_id = ""
+        if level is None or level=="":
+            level = 1
 
 
+        delete_standard_names = list(original_standard_alias_set-new_standard_alias_set)
+        insert_standard_names = list(new_standard_alias_set-original_standard_alias_set)
 
-        dpd = Document_product_dict(_d)
-        dpd.update_row(self.ots_client)
+        delete_remove_words = list(original_remove_words_set-new_remove_words_set)
+        insert_remove_words = list(new_remove_words_set-original_remove_words_set)
 
-        if old_name!=name:
-            # in the case of name changed ,delete the old name row
-            _d = {DOCUMENT_PRODUCT_DICT_ID:original_id}
-            dpd = Document_product_dict(_d)
+        log("update_interface delete_standard_names:%s insert_standard_names:%s delete_remove_words:%s insert_remove_words:%s"%(str(delete_standard_names),str(insert_standard_names),str(delete_remove_words),str(insert_remove_words)))
+        # update the milvus
+        Coll,_ = self.get_collection(grade)
+        if milvus_update_flag:
+            insert_new_record_to_milvus(Coll,name,grade,parent_id,final_standard_alias,final_remove_words,level)
+        if len(delete_standard_names)>0:
+            for _name in delete_standard_names:
+                delete_record_from_milvus(Coll,_name,"")
 
-            self.recurse_update_dict(original_id,new_id)
+        # update document_product_dict
+        # update alias
+        if len(new_alias_set&original_alias_set)!=len(new_alias_set):
+            bool_query = BoolQuery(must_queries=[
+                TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
+                TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
+            ])
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
+            list_data = getRow_ots(rows)
+            log("update dict table alias %d counts"%(total_count))
+            while next_token:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                    ColumnsToGet(return_type=ColumnReturnType.NONE))
+                list_data.extend(getRow_ots(rows))
+            for _data in list_data:
+                dpd = Document_product_dict(_data)
+                dpd.setValue(DOCUMENT_PRODUCT_DICT_ALIAS,final_alias,True)
+                dpd.update_row(self.ots_client)
 
-            dpd.delete_row(self.ots_client)
-            # change the next level parent_id
+        #if merge current names then update dict
+        for _name in insert_standard_names:
+            if _name==name:
+                continue
+            bool_query = BoolQuery(must_queries=[
+                TermQuery(DOCUMENT_PRODUCT_DICT_NAME,_name),
+                TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
+            ])
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
+            list_data = getRow_ots(rows)
+            log("delete dict table %d counts"%(total_count))
+            while next_token:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                    ColumnsToGet(return_type=ColumnReturnType.NONE))
+                list_data.extend(getRow_ots(rows))
+            for _data in list_data:
+                dpd = Document_product_dict(_data)
+                _id = _data.get(DOCUMENT_PRODUCT_DICT_ID)
+                log("delete id:%s"%(_id))
+                self.recurse_delete_dict(_id)
+                dpd.delete_row(self.ots_client)
+
+        # process history
+        if len(delete_standard_names)>0:
+            self.process_history_by_standard_name(name,grade,delete_standard_names,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE)
+        if len(insert_standard_names)>0:
+            self.process_history_by_standard_name(name,grade,insert_standard_names,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT)
+
+        if len(delete_remove_words)>0:
+            self.process_history_by_remove_words(name,grade,delete_remove_words,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE)
+        if len(insert_remove_words)>0:
+            self.process_history_by_remove_words(name,grade,insert_remove_words,DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT)
+
+        _interface_d = {
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ID:interface_id,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:final_alias,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:name,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:randint(201,300),
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:parent_id,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:final_standard_alias,
+            DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+            DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+            DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:final_remove_words,
+            DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL:level
+        }
+        _dpdi = Document_product_dict_interface(_interface_d)
+        _dpdi.update_row(self.ots_client)
 
 
     def recurse_update_dict(self,parent_id,new_parent_id):
@@ -527,6 +1012,10 @@ class Product_Dict_Manager():
                                                                                 columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
             list_data.extend(getRow_ots(rows))
 
+        interface_id = get_milvus_product_dict_id(name)
+
+
+
         #delete milvus records
         Coll,_ = self.get_collection(grade)
 
@@ -534,7 +1023,7 @@ class Product_Dict_Manager():
         time.sleep(1)
 
         #process_history data
-        self.process_history([name],grade,"delete")
+        self.process_history_by_name([name],grade,"delete")
 
         #delete document_product_dict
         log("delete document_product_dict name:%s grade:%s count:%s"%(str(name),str(grade),str(len(list_data))))
@@ -547,6 +1036,12 @@ class Product_Dict_Manager():
             dpd = Document_product_dict(_d)
             dpd.delete_row(self.ots_client)
 
+        _interface_d = {
+            DOCUMENT_PRODUCT_DICT_INTERFACE_ID:interface_id,
+        }
+        _dpdi = Document_product_dict_interface(_interface_d)
+        _dpdi.delete_row(self.ots_client)
+
 
 
     def recurse_delete_dict(self,id):
@@ -604,15 +1099,17 @@ class Product_Dict_Manager():
             alias = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS)
             grade = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE)
             original_id = item.get(DOCUMENT_PRODUCT_DICT_ORIGINAL_ID)
-            parent_id = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID)
+            parent_id = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID,"")
             standard_alias = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS)
             create_time = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME)
+            remove_words = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS,'')
+            level = item.get(DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL,1)
 
             if name is not None and len(name)>1 and len(name)<MAX_NAME_LENGTH:
                 if action=="insert":
-                    self.act_insert(name,alias,grade,original_id,parent_id,standard_alias,create_time)
+                    self.act_insert(name,alias,grade,original_id,parent_id,standard_alias,create_time,remove_words,level)
                 elif action=="update":
-                    self.act_update(name,alias,grade,original_id,parent_id,standard_alias,create_time)
+                    self.act_update(name,alias,grade,original_id,parent_id,standard_alias,create_time,remove_words,level)
                 elif action=="delete":
                     self.act_delete(name,alias,grade,original_id,parent_id,standard_alias,create_time)
 
@@ -882,7 +1379,18 @@ def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_
                         [level]
                         ]
                 insert_embedding(Coll,data)
-        return True
+        # poll until the inserted row is queryable, so later searches see it
+        while 1:
+            try:
+                log("milvus insert wait for done")
+                list_result = Coll.query(expr=expr,output_fields=["standard_name"])
+                log("list_result %s"%(str(list_result)))
+                if len(list_result)==1 and list_result[0].get("standard_name","")==name:
+                    log("milvus insert done")
+                    return True
+            except Exception as e:
+                traceback.print_exc()
+            time.sleep(1)
 
 def delete_record_from_milvus(Coll,name,standard_alias):
 
@@ -906,6 +1414,12 @@ def delete_record_from_milvus(Coll,name,standard_alias):
 
             expr = " ots_id in ['%s']"%_id
             Coll.delete(expr)
+    # poll until the deleted rows are no longer queryable
+    while 1:
+        if len(Coll.query(expr=expr))==0:
+            return
+        log("milvus delete wait for done")
+        time.sleep(1)
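
Note: both loops above poll milvus until the write is actually visible before returning, since inserts and deletes are applied asynchronously. A condensed sketch of the shared pattern with a bounded wait, assuming a pymilvus Collection; wait_until_visible and the timeout are additions for illustration, not part of the patch (the patch itself polls indefinitely):

import time
import traceback

def wait_until_visible(coll, expr, expect_count, timeout=60, interval=1):
    # Poll until query(expr) returns the expected number of rows, or
    # give up after `timeout` seconds and report failure to the caller.
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if len(coll.query(expr=expr)) == expect_count:
                return True
        except Exception:
            traceback.print_exc()
        time.sleep(interval)
    return False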
 
 
 
@@ -923,7 +1437,7 @@ def dict_interface_delete(name,grade,ots_client = getConnect_ots()):
 
 def interface_deletes():
     a = '''
-    明细
+    按采购需求执行
     '''
     grade = 4
     ots_client=getConnect_ots()
@@ -934,6 +1448,26 @@ def interface_deletes():
         print(s)
         dict_interface_delete(s,grade,ots_client)
 
+def interface_update():
+    name = "保健"
+    new_standard_alias = ""
+    new_remove_words = "+设备"
+    grade = 4
+    ots_client = getConnect_ots()
+
+
+    from uuid import uuid4
+    _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:name,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:grade,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"update",
+          DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS:new_standard_alias,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS:new_remove_words,
+          DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
+    dpdi = Document_product_dict_interface(_d)
+    dpdi.update_row(ots_client)
+
 def clean_brands():
     from queue import Queue as TQueue
     task_queue = TQueue()
@@ -1011,8 +1545,70 @@ def clean_brands():
         for _name in list_illegal:
             f.write("%s\n"%(_name))
 
+def clean_product_dict():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery("status",0)
+    ])
+    task_queue = Queue()
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
+                                                                   columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+    list_data = getRow_ots(rows)
+    for _data in list_data:
+        task_queue.put(_data)
+    print("%d/%d"%(task_queue.qsize(),total_count))
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        for _data in list_data:
+            task_queue.put(_data)
+        print("%d/%d"%(task_queue.qsize(),total_count))
+
+    def _handle(item,result_queue):
+        _dpd = Document_product_dict(item)
+        _dpd.delete_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+def clean_product_dict_interface():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        BoolQuery(should_queries=[
+            TermQuery("action","insert"),
+            TermQuery("action","base")
+        ])
+    ])
+    task_queue = Queue()
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
+                                                                   columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+    list_data = getRow_ots(rows)
+    for _data in list_data:
+        task_queue.put(_data)
+    print("%d/%d"%(task_queue.qsize(),total_count))
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        for _data in list_data:
+            task_queue.put(_data)
+        print("%d/%d"%(task_queue.qsize(),total_count))
+
+    def _handle(item,result_queue):
+        _dpd = Document_product_dict_interface(item)
+        _dpd.delete_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
 if __name__ == '__main__':
     # start_embedding_product_dict()
-    interface_deletes()
+    # interface_deletes()
+    # interface_update()
     # clean_similar()
-    # clean_brands()
+    # clean_brands()
+    # clean_product_dict()
+    clean_product_dict_interface()
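
Note: clean_product_dict and clean_product_dict_interface above repeat the same paginated scan-and-delete loop. A consolidated sketch of that pattern, using the tablestore helpers this module already imports; scan_and_delete is a hypothetical name:

from queue import Queue

def scan_and_delete(ots_client, table_name, bool_query, model_cls, threads=30):
    # Page through the search index with next_token, queue every row,
    # then delete them all with MultiThreadHandler as above.
    task_queue = Queue()
    rows, next_token, total_count, _ = ots_client.search(
        table_name, table_name + "_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("status")]),
                    get_total_count=True, limit=100),
        columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
    while True:
        for _data in getRow_ots(rows):
            task_queue.put(_data)
        print("%d/%d" % (task_queue.qsize(), total_count))
        if not next_token:
            break
        rows, next_token, total_count, _ = ots_client.search(
            table_name, table_name + "_index",
            SearchQuery(bool_query, next_token=next_token,
                        get_total_count=True, limit=100),
            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))

    def _handle(item, result_queue):
        model_cls(item).delete_row(ots_client)

    MultiThreadHandler(task_queue, _handle, None, threads).run()

# e.g. scan_and_delete(getConnect_ots(), Document_product_dict_table_name,
#                      bool_query, Document_product_dict)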

+ 204 - 52
BaseDataMaintenance/maintenance/product/products.py

@@ -13,6 +13,7 @@ from BaseDataMaintenance.model.ots.document_product_dict_interface import *
 from BaseDataMaintenance.model.ots.document import *
 from BaseDataMaintenance.model.ots.attachment import *
 from BaseDataMaintenance.model.ots.enterprise import *
+from BaseDataMaintenance.model.ots.project import *
 
 from tablestore import *
 
@@ -111,7 +112,10 @@ class Product_Manager(Product_Dict_Manager):
 
 
     def comsumer_handle(self,item,result_queue):
-        self.standardize(item)
+        try:
+            self.standardize(item)
+        except Exception as e:
+            traceback.print_exc()
 
 
 
@@ -137,11 +141,15 @@ class Product_Manager(Product_Dict_Manager):
 
         document_product_tmp = Document_product_tmp(tmp_dict)
 
+        tenderee = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_TENDEREE,"")
+
         name = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,"")
         brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
         parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
 
+        name = name.replace(tenderee,"")
+        brand = brand.replace(tenderee,"")
 
         original_name = name
         original_brand = brand
@@ -177,7 +185,7 @@ class Product_Manager(Product_Dict_Manager):
                 remove_words = _search.get("remove_words")
 
                 if check_product(name,ots_name,remove_words):
-                    name_ots_id = ots_id
+                    name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
                     original_name = name
                     new_name = standard_name
 
@@ -203,7 +211,7 @@ class Product_Manager(Product_Dict_Manager):
                     if check_product(name,ots_name,remove_words):
 
                         log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
-                        name_ots_id = ots_id
+                        name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
                         original_name = name
                         new_name = standard_name
 
@@ -224,6 +232,9 @@ class Product_Manager(Product_Dict_Manager):
 
                 _find = False
                 for brand in l_brand:
+
+                    if len(brand)>100:
+                        continue
                     search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
 
                     # log("search brand %s"%(brand))
@@ -243,6 +254,11 @@ class Product_Manager(Product_Dict_Manager):
                             if ots_name==new_name:
                                 continue
                             original_brand = brand
+                            if original_brand==original_name:
+                                if original_brand.find(ots_name)>=1:
+                                    continue
+                                if len(original_brand)<=3:
+                                    continue
                             new_brand = standard_name
 
                             log("checking brand %s succeed %s"%(brand,new_brand))
@@ -253,7 +269,7 @@ class Product_Manager(Product_Dict_Manager):
 
                                 _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
                                             DOCUMENT_PRODUCT_DICT_NAME:new_brand,
-                                            DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
+                                            DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
                                             DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
                                             DOCUMENT_PRODUCT_DICT_STATUS:1,
                                             DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
@@ -285,15 +301,24 @@ class Product_Manager(Product_Dict_Manager):
                         break
                 if not _find:
                     for brand in l_brand:
+                        if len(brand)>100:
+                            continue
                         if self.check_new_brand(brand):
                             new_brand = clean_product_brand(brand)
                             if new_brand=="":
                                 continue
                             original_brand = brand
+                            if original_brand==original_name:
+                                if new_name==original_brand:
+                                    continue
+                                if original_brand.find(new_brand)>=1:
+                                    continue
+                                if len(original_brand)<=3:
+                                    continue
                             log("adding new brand %s"%(str(new_brand)))
                             _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(new_brand).lower()),
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
@@ -314,6 +339,8 @@ class Product_Manager(Product_Dict_Manager):
                     l_brand = [brand]
 
                     for brand in l_brand:
+                        if len(brand)>100:
+                            continue
                         if _find:
                             break
 
@@ -332,7 +359,13 @@ class Product_Manager(Product_Dict_Manager):
                                 # log("check brand similar succeed:%s and %s"%(brand,ots_name))
                                 if ots_name==new_name:
                                     continue
-                                orignal_brand = brand
+
+                                original_brand = brand
+                                if original_brand==original_name:
+                                    if original_brand.find(ots_name)>=1:
+                                        continue
+                                    if len(original_brand)<=3:
+                                        continue
                                 new_brand = standard_name
 
                                 log("checking brand %s succeed %s"%(brand,new_brand))
@@ -343,7 +376,7 @@ class Product_Manager(Product_Dict_Manager):
 
                                     _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
                                                 DOCUMENT_PRODUCT_DICT_NAME:new_brand,
-                                                DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
+                                                DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
                                                 DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
                                                 DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                 DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
@@ -405,6 +438,8 @@ class Product_Manager(Product_Dict_Manager):
                                 if check_specs(c_specs,ots_name):
                                     break_flag = True
                                     original_specs = c_specs
+                                    if standard_name==new_name:
+                                        continue
                                     new_specs = standard_name
                                     log("check_specs %s succeed %s"%(specs,new_specs))
 
@@ -415,7 +450,7 @@ class Product_Manager(Product_Dict_Manager):
 
                                         _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                     DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
+                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
                                                     DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                     DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                     DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
@@ -449,12 +484,16 @@ class Product_Manager(Product_Dict_Manager):
                         if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
                             debug("is_legal_specs")
                             original_specs = specs
+
                             new_specs = clean_product_specs(specs)
+                            if new_specs==new_name:
+                                new_specs = ""
+                                continue
                             # insert into document_product_dict a new record
                             # to update the document_product_dict which is builded for search
                             # add new specs
                             if brand_ots_id is not None and name_ots_id is not None:
-                                _md5 = get_document_product_dict_id(brand_ots_id,new_specs)
+                                specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
 
                                 # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
                                 #       DOCUMENT_PRODUCT_DICT_NAME:new_specs,
@@ -525,13 +564,16 @@ class Product_Manager(Product_Dict_Manager):
                                     break_flag = True
                                     original_specs = c_specs
                                     new_specs = standard_name
+                                    if new_specs==new_name:
+                                        new_specs = ""
+                                        continue
                                     if brand_ots_id is not None:
                                         # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
                                         specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
 
                                         _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                     DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
+                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
                                                     DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                     DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                     DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
@@ -636,14 +678,16 @@ class Product_Manager(Product_Dict_Manager):
             if not is_legal_data:
                 _status = randint(501,550)
 
-            elif self.dumplicate(_product):
-                _status = randint(201,300)
-                save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
-
-                _product.update_row(self.ots_client)
-
             else:
-                _status = randint(451,500)
+                _flag,dump_id = self.dumplicate(_product)
+                if _flag:
+                    _status = randint(201,300)
+                    save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
+
+                    _product.update_row(self.ots_client)
+                else:
+                    _status = randint(451,500)
+                    save_product_tmp.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
 
         else:
             _status = randint(401,450)
@@ -707,11 +751,11 @@ class Product_Manager(Product_Dict_Manager):
     def get_value_count(self,name,brand,specs,unit_price,quantity):
 
         value_count = 0
-        if len(name)>0:
+        if name is not None and len(name)>0:
             value_count += 1
-        if len(brand)>0:
+        if brand is not None and len(brand)>0:
             value_count += 1
-        if len(specs)>0:
+        if specs is not None and len(specs)>0:
             value_count += 1
         if isinstance(unit_price,(float,int)) and unit_price>0:
             value_count += 1
@@ -731,7 +775,8 @@ class Product_Manager(Product_Dict_Manager):
         tenderee = str(document_product.getProperties().get(DOCUMENT_PRODUCT_TENDEREE,""))
         supplier = str(document_product.getProperties().get(DOCUMENT_PRODUCT_SUPPLIER,""))
 
-
+        base_value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
+        list_dump_id = []
         page_time_before = page_time
         page_time_after = page_time
         try:
@@ -740,6 +785,8 @@ class Product_Manager(Product_Dict_Manager):
         except Exception as e:
             pass
 
+        to_save = 1
+
         if len(name)>0 and len(brand)>0 and len(specs)>0 and isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
             bool_query = BoolQuery(must_queries=[TermQuery("name",name),
                                                  RangeQuery("page_time",page_time_before,page_time_after,True,True),
@@ -754,7 +801,64 @@ class Product_Manager(Product_Dict_Manager):
                                                                                 columns_to_get=ColumnsToGet(["name",'brand','specs'],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
             if len(list_data)>0:
-                return list_data[0].get(DOCUMENT_PRODUCT_ID),1
+                return list_data[0].get(DOCUMENT_PRODUCT_ID),0
+
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(project_docids,str(docid)),
+        ])
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
+                                                                            SearchQuery(bool_query,limit=10),
+                                                                            ColumnsToGet([project_docids],return_type=ColumnReturnType.SPECIFIED))
+
+        list_data = getRow_ots(rows)
+        set_docid = set()
+        for _data in list_data:
+            _docids = _data.get(project_docids,"")
+            for d_id in _docids.split(","):
+                d_id = d_id.strip()
+                if d_id!="":
+                    set_docid.add(int(d_id))
+        if docid in set_docid:
+            set_docid.remove(docid)
+        should_q = [TermQuery(DOCUMENT_PRODUCT_DOCID,did) for did in set_docid]
+        if len(should_q)>0:
+            bool_query = BoolQuery(must_queries=[TermQuery("name",name),
+                                                 BoolQuery(should_queries=should_q),
+                                                 ])
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                SearchQuery(bool_query,limit=50),
+                                                                                columns_to_get=ColumnsToGet(["docid",'name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            dict_docid_name = {}
+            match_ids = []
+            for _data in list_data:
+                docid1 = _data.get(DOCUMENT_PRODUCT_DOCID)
+                name1 = _data.get(DOCUMENT_PRODUCT_NAME)
+                brand1 = _data.get(DOCUMENT_PRODUCT_BRAND)
+                specs1 = _data.get(DOCUMENT_PRODUCT_SPECS)
+                unit_price1 = _data.get(DOCUMENT_PRODUCT_UNIT_PRICE)
+                quantity1 = _data.get(DOCUMENT_PRODUCT_QUANTITY)
+                id = _data.get(DOCUMENT_PRODUCT_ID)
+                value_count1 = self.get_value_count(name1,brand1,specs1,unit_price1,quantity1)
+                if name1==name:
+                    match_ids.append({DOCUMENT_PRODUCT_ID:id,"value_count":value_count1})
+                    if docid1 not in dict_docid_name:
+                        dict_docid_name[docid1] = []
+                    dict_docid_name[docid1].append(name)
+            is_all_one = True
+            for k,v in dict_docid_name.items():
+                if len(v)!=1:
+                    is_all_one = False
+            if is_all_one:
+                match_ids.sort(key=lambda x:x.get("value_count",0),reverse=True)
+                if len(match_ids)>0:
+                    _id = match_ids[0].get(DOCUMENT_PRODUCT_ID)
+                    value_count1 = match_ids[0]["value_count"]
+                    if base_value_count<value_count1:
+                        to_save = 0
+                    for _match in match_ids:
+                        list_dump_id.append(_match.get(DOCUMENT_PRODUCT_ID))
+
 
         if len(name)>0 and len(brand)>0 and len(supplier)>0 and len(tenderee)>0:
             # log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
@@ -769,7 +873,7 @@ class Product_Manager(Product_Dict_Manager):
                                                                                 SearchQuery(bool_query,limit=50),
                                                                                 columns_to_get=ColumnsToGet(['name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
-            value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
+
 
             for _d in list_data:
                 s_id = _d.get(DOCUMENT_PRODUCT_ID)
@@ -788,12 +892,10 @@ class Product_Manager(Product_Dict_Manager):
                     check_flag = False
 
                 if check_flag:
-                    if value_count<value_count1:
+                    if base_value_count<value_count1:
                         to_save = 0
-                    else:
-                        to_save = 1
-                    return s_id,to_save
-        return None,1
+                    list_dump_id.append(s_id)
+        return list_dump_id,to_save
 
 
     def dumplicate(self,document_product):
@@ -806,18 +908,27 @@ class Product_Manager(Product_Dict_Manager):
         dump_id,to_save = self.dumplicate_search_product(document_product)
 
         if dump_id is not None:
-            document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,dump_id,True)
+            document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
 
         if to_save==1:
             if dump_id is not None:
-                _d = {DOCUMENT_PRODUCT_ID:dump_id,
-                      DOCUMENT_PRODUCT_STATUS:randint(401,450),
-                      DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
-                _dp = Document_product(_d)
-                _dp.update_row(self.ots_client)
-            return True
+                if isinstance(dump_id,str):
+                    _d = {DOCUMENT_PRODUCT_ID:dump_id,
+                          DOCUMENT_PRODUCT_STATUS:randint(401,450),
+                          DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
+                    _dp = Document_product(_d)
+                    _dp.update_row(self.ots_client)
+                elif isinstance(dump_id,list):
+                    for d_id in dump_id:
+                        _d = {DOCUMENT_PRODUCT_ID:d_id,
+                              DOCUMENT_PRODUCT_STATUS:randint(401,450),
+                              DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
+                        _dp = Document_product(_d)
+                        _dp.update_row(self.ots_client)
+
+            return True,dump_id
         else:
-            return False
+            return False,dump_id
 
     def start_processing(self):
         scheduler = BlockingScheduler()
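
Note: under the new contract above, dumplicate_search_product returns (list_dump_id, to_save) and dumplicate demotes every matched record instead of a single one. A condensed sketch of the decision rule, assuming value_count is a callable form of get_value_count over a row dict; pick_dump_ids is a hypothetical name:

def pick_dump_ids(base_row, candidate_rows, value_count):
    # Keep the incoming row only if no same-name candidate carries more
    # filled-in fields; all matched candidates are demoted either way.
    matches = [r for r in candidate_rows if r.get("name") == base_row.get("name")]
    matches.sort(key=value_count, reverse=True)
    to_save = 1
    if matches and value_count(base_row) < value_count(matches[0]):
        to_save = 0
    return [r.get("id") for r in matches], to_save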
@@ -863,18 +974,21 @@ def fix_product_data():
     '''
     table_name = "document_product_temp"
     table_index = "document_product_temp_index"
+    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
+
+
+    table_name = Document_product_table_name
+    table_index = Document_product_table_name+"_index"
+    columns = [DOCUMENT_PRODUCT_ORIGINAL_ID]
 
-    # table_name = "document_product"
-    # table_index = "document_product_index"
 
-    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
     ots_client = getConnect_ots()
     bool_query = BoolQuery(should_queries=[
         # RangeQuery("status",501),
         # TermQuery("docid",246032980)
 
-        RangeQuery("status",201,301),
-        RangeQuery("status",401,451)
+        RangeQuery("status",401,501),
+        # RangeQuery("status",401,451)
         # WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
         # TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
     ])
@@ -884,6 +998,7 @@ def fix_product_data():
                                                                    columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
 
     list_rows = getRow_ots(rows)
+    print(total_count)
     while next_token:
         rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
                                                                        SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -932,13 +1047,18 @@ def fix_product_data():
     def deleteAndReprocess(item,result_queue):
 
         original_id = item.get(DOCUMENT_PRODUCT_TMP_ID)
+        new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
+
+        original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+        new_id = item.get(DOCUMENT_PRODUCT_ID)
+
         print("original_id",original_id,"id",item.get(DOCUMENT_PRODUCT_ID))
         # delete data and rerun
         _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
         dpt = Document_product_tmp(_d)
         dpt.update_row(ots_client)
 
-        new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
+
         if new_id is not None and new_id!="":
             _d = {DOCUMENT_PRODUCT_ID:new_id}
             dp = Document_product(_d)
@@ -1006,19 +1126,19 @@ def test_check_brand():
             f.write(b+"\n")
 
 def test_match():
-    a = "兽医设备"
-
+    a = "桂林市啄木鸟医疗器械有限公司"
 
     # vector = request_embedding(get_milvus_standard_name(a))
-    vector = [get_embedding_request(b) for b in a]
+    # vector = [get_embedding_request(b) for b in a]
     pm = Product_Manager()
-    _GRADE = NAME_GRADE
+    _GRADE = BRAND_GRADE
     Coll,_ = pm.get_collection(_GRADE)
     print(Coll.name)
 
     output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
     # start_time = time.time()
-    # print(Coll.query(expr=" ots_id in ['75058b275a4c1d8ee38b58c5c5cce3bb'] ",output_fields=output_fields))
+    _id = get_milvus_product_dict_id(a)
+    print(Coll.query(expr=" ots_id in ['%s'] "%(_id),output_fields=output_fields))
     # print("cost",time.time()-start_time)
     # print(Coll.compact())
     # result = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=20)
@@ -1039,7 +1159,8 @@ def test_match():
         ots_name = _search.get("ots_name")
         standard_name = _search.get("standard_name")
         ots_parent_id = _search.get("ots_parent_id")
-        if is_similar(a,ots_name) or check_product(a,ots_name):
+        remove_words = _search.get("remove_words")
+        if check_brand(a,ots_name,remove_words):
             print("similar",a,ots_name)
         else:
             print("not similar",a,ots_name)
@@ -1218,7 +1339,7 @@ def delete_brands():
 
 
 def delete_specs():
-    filename = os.path.join(current_path,"search_similar2_1.xlsx_specs_move.txt")
+    filename = os.path.join(current_path,"illegal_specs.txt")
 
     ots_client = getConnect_ots()
     list_brand = []
@@ -1314,9 +1435,9 @@ def update_document_product_dict():
 def test():
     # pm = Product_Manager()
     # pm.test()
-    fix_product_data()
+    # fix_product_data()
     # test_check_brand()
-    # test_match()
+    test_match()
     # rebuild_milvus()
 
     # move_document_product()
@@ -1325,9 +1446,40 @@ def test():
     # remove_redis_keys()
     # update_document_product_dict()
 
+def clean_product_dict_interface():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        BoolQuery(should_queries=[
+            TermQuery("action","insert"),
+            TermQuery("action","base")
+        ])
+    ])
+    task_queue = Queue()
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
+                                                                   columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+    list_data = getRow_ots(rows)
+    for _data in list_data:
+        task_queue.put(_data)
+    print("%d/%d"%(task_queue.qsize(),total_count))
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        for _data in list_data:
+            task_queue.put(_data)
+        print("%d/%d"%(task_queue.qsize(),total_count))
+
+    def _handle(item,result_queue):
+        _dpd = Document_product_dict_interface(item)
+        _dpd.delete_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
 
 if __name__ == '__main__':
 
+    # test()
     # start_process_product()
     # print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
     # print(Product_Manager.get_bid_filemd5s(155415770,getConnect_ots()))
@@ -1335,4 +1487,4 @@ if __name__ == '__main__':
     # ots_name = "一氧化碳分析仪"
     # print(is_similar(name,ots_name),check_product(name,ots_name))
     # print(is_legal_specs('SCM-A/SB(0.18D)'))
-    test()
+    clean_product_dict_interface()

+ 29 - 0
BaseDataMaintenance/maxcompute/attachmentRec.py

@@ -84,6 +84,33 @@ class f_getPlatform(object):
         return getPlatform()
 
 
+import hashlib
+def getMD5(_text):
+    if _text is not None and len(_text)>0:
+        if isinstance(_text,str):
+            bs = _text.encode()
+        elif isinstance(_text,bytes):
+            bs = _text
+        else:
+            return ""
+        md5 = hashlib.md5()
+        md5.update(bs)
+        return md5.hexdigest()
+    return ""
+MAX_NAME_LENGTH = 300
+def get_milvus_standard_name(name):
+    return "%s"%(str(name)[:MAX_NAME_LENGTH].lower())
+
+def get_milvus_product_dict_id(name):
+    return getMD5(get_milvus_standard_name(name))
+
+@annotate('->string')
+class f_getMD5(object):
+
+    def evaluate(self,name):
+        return get_milvus_product_dict_id(name)
+
+
 @annotate('string->string,string,bigint')
 class f_strip_filemd5(BaseUDTF):
 
@@ -97,6 +124,8 @@ class f_strip_filemd5(BaseUDTF):
 
         self.forward(filemd5,filemd5_strip,parts)
 
+
+
 @annotate('string,bigint->string')
 class f_group_filemd5(BaseUDAF):
 

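Note: the MD5 helpers are copied into the MaxCompute side so that offline jobs derive the same milvus dict id as the online service; presumably the UDF runtime cannot import the maintenance package directly. A quick equivalence check, assuming a hypothetical import path for the online helper and that both modules load in one interpreter:

# Hypothetical sanity check: the two copies must agree, otherwise ids
# will not join across the offline and online systems.
from BaseDataMaintenance.maintenance.product.productUtils import (
    get_milvus_product_dict_id as online_id)
assert get_milvus_product_dict_id("投影仪") == online_id("投影仪")
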
+ 12 - 7
BaseDataMaintenance/model/ots/document_product_dict_interface.py

@@ -15,10 +15,20 @@ DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS = "standard_alias"
 DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR = "|"
 DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION = "action" #insert delete update
 
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE = "base"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE = "delete"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_UPDATE = "update"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT = "insert"
+
+DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS = "remove_words"
+DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL = "level"
+
 MAX_NAME_LENGTH = 300
 
 Document_product_dict_interface_table_name = "document_product_dict_interface"
 
+
+
 class Document_product_dict_interface(BaseModel):
 
     def __init__(self,_dict):
@@ -31,11 +41,6 @@ class Document_product_dict_interface(BaseModel):
         return ["id"]
 
 
-
-
 from BaseDataMaintenance.common.documentFingerprint import getMD5
-def get_document_product_dict_id(parent_md5,name):
-    return getMD5(parent_md5+"&&%s"%name)
-
-def get_document_product_dict_standard_alias_id(name):
-    return getMD5("alias&&%s"%name)
+def get_document_product_dict_interface_base_id(name):
+    return "mdd5="+getMD5(name)