Bläddra i källkod

调整匹配规则和搜索规则

luojiehua 1 år sedan
förälder
incheckning
e40cb40b61

+ 1 - 0
.gitignore

@@ -4,3 +4,4 @@
 /.idea/
 /attachmentProcessTime2.xlsx
 /BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx
+/BaseDataMaintenance/test/

+ 7 - 3
BaseDataMaintenance/maintenance/dataflow.py

@@ -2796,7 +2796,7 @@ class Dataflow_dumplicate(Dataflow):
 
 
     def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
-        def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status]):
+        def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document]):
             q_size = self.queue_dumplicate.qsize()
             log("dumplicate queue size %d"%(q_size))
             if q_size>flow_process_count//3:
@@ -2806,7 +2806,7 @@ class Dataflow_dumplicate(Dataflow):
                 # TermQuery("docid",271983871)
             ])
             rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
-                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_update_document,SortOrder.DESC),FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                                 ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
             log("flow_dumplicate producer total_count:%d"%total_count)
             list_dict = getRow_ots(rows)
@@ -3861,7 +3861,7 @@ class Dataflow_dumplicate(Dataflow):
             set_docid = set()
 
             list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=False)
-            print("len_rules",len(list_rules),table_name,table_index)
+            # print("len_rules",len(list_rules),table_name,table_index)
             list_rules.sort(key=lambda x:x["confidence"],reverse=True)
             _i = 0
             step = 5
@@ -3914,6 +3914,7 @@ class Dataflow_dumplicate(Dataflow):
 
             remove_list = []
 
+
             if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
@@ -3940,6 +3941,9 @@ class Dataflow_dumplicate(Dataflow):
             list_docids.append(best_docid)
             b_log = False if upgrade else True
 
+            if item.get(document_update_document)=="true":
+                dtmp.setValue(document_tmp_save,1,True)
+
             if exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0:
                 log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)

+ 2 - 0
BaseDataMaintenance/maintenance/product/1.py

@@ -72,6 +72,8 @@ for b in a.split("\n"):
      list_c.append(d)
 print(",".join(list_c))
 
+print("BENEHEARTD6".lower()=="BeneHeartD6".lower())
+
 
 
 

+ 70 - 9
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -26,6 +26,34 @@ from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_
 
 area_set = get_area_set()
 
+
+def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
+
+    vector = []
+    v = get_embedding_request(name)
+    if v is not None:
+        vector.append(v)
+    if len(str(name))>=5:
+        strides = [3,6]
+        for stride in strides:
+            steps = len(name)//stride
+            if len(name)%stride>=stride//2+1:
+                steps += 1
+            _begin = 0
+            for i in range(min(steps,max_steps)):
+                _name = str(name)[i*stride:(i+1)*stride+2]
+                v = get_embedding_request(_name)
+                if v is not None:
+                    vector.append(v)
+
+    if len(vector)>0:
+        list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
+        if list_search:
+            return list_search
+
+    return []
+
+
 def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit=3):
 
     if name is None or name=="":
@@ -39,8 +67,10 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
         except Exception as e:
             log("get redis data error")
         if _search_list is not None:
+            log("_search_list is not None")
             return json.loads(_search_list)
         else:
+            log("search from milvus")
             list_result = []
             result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
             for hits in result:
@@ -85,7 +115,7 @@ def get_embedding_request(sentence,retry_times=3):
     db = redis.Redis(connection_pool=pool_product)
 
     try:
-        _md5 = getMD5(str(sentence))+"_embedding"
+        _md5 = getMD5(get_milvus_standard_name(sentence))+"_embedding"
         _embedding = None
         try:
             _embedding = db.get(_md5)
@@ -181,12 +211,15 @@ def is_similar(source,target,_radio=None):
     #判断相似度
     similar = fuzz.ratio(source,target)
     if similar>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar))
         return True
     similar_jaro = Levenshtein.jaro(source,target)
     if similar_jaro*100>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
         return True
     similar_jarow = Levenshtein.jaro_winkler(source,target)
     if similar_jarow*100>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
         return True
 
     if min_len>=5:
@@ -211,6 +244,8 @@ def is_contain(source,target,min_len=2):
         return True
     return False
 
+
+
 def check_product(source,target):
     if is_contain(source,target,min_len=3):
         return True
@@ -218,11 +253,36 @@ def check_product(source,target):
 
 
 def check_brand(source,target):
-    source = str(source).lower()
-    target = str(target).lower()
+    source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
+    target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
 
-    if is_contain(source,target):
-        return True
+    max_len = max(len(source),len(target))
+    min_len = min(len(source),len(target))
+
+    min_ratio = 92
+    if max_len<2:
+        return False
+    elif max_len<=5:
+        min_ratio=94
+    else:
+        min_ratio = 90
+
+    source_c = "".join(get_chinese_string(source))
+    target_c = "".join(get_chinese_string(target))
+    print(source_c,target_c)
+    if len(source_c)>=2 and len(target_c)>=2:
+        if not(source_c in area_set or target_c in area_set):
+            if is_similar(source_c,target_c,min_ratio):
+                return True
+
+            if is_contain(source_c,target_c):
+                return True
+    if has_same_specs_count(source,target):
+        if is_similar(source,target,min_ratio):
+            return True
+
+        if is_contain(source,target):
+            return True
 
 SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
 NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
@@ -246,7 +306,7 @@ def has_same_specs_count(source, target):
                 dict_target[s] = 0
             dict_target[s] += 1
     union_keys = set(list(dict_source.keys())) & set(list(dict_target.keys()))
-    if len(dict_source.keys())!= len(union_keys):
+    if len(dict_source.keys())!= len(union_keys) or len(dict_target.keys())!= len(union_keys):
         return False
     for k,v in dict_source.items():
         if v!=dict_target.get(k):
@@ -439,9 +499,10 @@ def clean_product_quantity(product_quantity):
     return ""
 
 if __name__ == '__main__':
-    print(is_similar('超声','超声炮',_radio=99))
+    print(check_brand('杭州郎基','杭州利华'))
     # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
     # import Levenshtein
     # print(Levenshtein.ratio('助听器','助行器'))
-    print(clean_product_specs("//4008SverssionV10"))
-    # print(is_legal_brand(getConnect_ots(),"康复"))
+    # print(clean_product_specs("//4008SverssionV10"))
+    # print(is_legal_brand(getConnect_ots(),"康复"))
+    # print(check_specs("500ml","3500ml"))

+ 108 - 18
BaseDataMaintenance/maintenance/product/product_dict.py

@@ -652,9 +652,9 @@ def search_similar():
 
     columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE]
 
-    bool_query = BoolQuery(
-        must_queries=[RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,5,5,True,True)]
-    )
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,4,5,True,True),
+    ])
 
     rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                         SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED)]),limit=100,get_total_count=True),
@@ -671,8 +671,8 @@ def search_similar():
         list_dict = getRow_ots(rows)
         for _d in list_dict:
             list_data.append(_d)
-        if len(list_data)>=100000:
-            break
+        # if len(list_data)>=1000:
+        #     break
     log("product_dict embedding total_count:%d"%total_count)
     set_key = set()
     for _d in list_data:
@@ -681,7 +681,7 @@ def search_similar():
         _key = "%s-%d"%(name,grade)
         if _key in set_key:
             continue
-        set_key.add(set_key)
+        set_key.add(_key)
         task_queue.put(_d)
 
     result_queue = Queue()
@@ -697,19 +697,18 @@ def search_similar():
         if vector is not None and Coll is not None:
             search_list = get_embedding_search(Coll,embedding_index_name,name,grade,[vector],pdm.search_params,output_fields,limit=10)
             for _item in search_list:
-                ots_id = _item.get("id")
+                ots_id = _item.get("ots_id")
                 ots_name = _item.get("ots_name")
                 ots_parent_id = _item.get("ots_parent_id")
                 standard_name = _item.get("standard_name")
-                if name!=ots_name:
-                    if grade==4:
-                        if is_similar(name,ots_name) or check_brand(name,ots_name):
-                            _d = {"source_id":id,"source_name":name,"grade":grade,"target_id":ots_id,"target_name":ots_name,"parent_id":parent_id,"target_parent_id":ots_parent_id,"target_standard_name":standard_name}
-                            result_queue.put(_d)
-                    elif grade==5:
-                        if is_similar(name,ots_name) and check_specs(name,ots_name):
-                            _d = {"source_id":id,"source_name":name,"grade":grade,"target_id":ots_id,"target_name":ots_name,"parent_id":parent_id,"target_parent_id":ots_parent_id,"target_standard_name":standard_name}
-                            result_queue.put(_d)
+                if grade==4:
+                    if check_brand(name,ots_name):
+                        _d = {"source_id":id,"source_name":name,"grade":grade,"target_id":ots_id,"target_name":ots_name,"parent_id":parent_id,"target_parent_id":ots_parent_id,"target_standard_name":standard_name}
+                        result_queue.put(_d)
+                elif grade==5:
+                    if is_similar(name,ots_name) and check_specs(name,ots_name):
+                        _d = {"source_id":id,"source_name":name,"grade":grade,"target_id":ots_id,"target_name":ots_name,"parent_id":parent_id,"target_parent_id":ots_parent_id,"target_standard_name":standard_name}
+                        result_queue.put(_d)
 
 
     mt = MultiThreadHandler(task_queue,handle,result_queue,5,1)
@@ -734,7 +733,97 @@ def search_similar():
             break
     import pandas as pd
     df = pd.DataFrame(df_data)
-    df.to_excel("search_similar1.xlsx",columns=df_columns)
+    df.to_excel("search_similar2.xlsx",columns=df_columns)
+
+
+def clean_similar():
+    import pandas as pd
+    filename = "../../test/search_similar2_1.xlsx"
+
+    df = pd.read_excel(filename)
+    _set = set()
+    list_source_name = []
+    list_grade = []
+    list_target_name = []
+    list_check = []
+
+    brand_set_move = set()
+    brand_set_keep = set()
+    specs_set_move = set()
+    specs_set_keep = set()
+    for source_name,grade,target_name in zip(df["source_name"],df["grade"],df["target_name"]):
+        source_name = str(source_name)
+        target_name = str(target_name)
+        if source_name==target_name:
+            continue
+        _key1 = "%s-%s"%(source_name,target_name)
+        _key2 = "%s--%s"%(target_name,source_name)
+        if _key1 in _set or _key2 in _set:
+            continue
+        _set.add(_key1)
+        _set.add(_key2)
+        list_source_name.append(source_name)
+        list_grade.append(grade)
+        list_target_name.append(target_name)
+        if grade==4:
+            _check = check_brand(source_name,target_name)
+        elif grade==5:
+            _check = is_similar(source_name,target_name) and check_specs(source_name,target_name)
+        list_check.append(_check)
+        if _check:
+            if grade==4:
+                n_source_name = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source_name))
+                n_target_name = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target_name))
+            else:
+                n_source_name = source_name
+                n_target_name = target_name
+            source_dis = abs(len(n_source_name)-4.6)
+            target_dis = abs(len(n_target_name)-4.6)
+            if source_dis>target_dis:
+                if grade==4:
+                    brand_set_keep.add(target_name)
+                    brand_set_move.add(source_name)
+                elif grade==5:
+                    specs_set_keep.add(target_name)
+                    specs_set_move.add(source_name)
+            else:
+                if grade==4:
+                    brand_set_keep.add(source_name)
+                    brand_set_move.add(target_name)
+                elif grade==5:
+                    specs_set_keep.add(source_name)
+                    specs_set_move.add(target_name)
+    df = pd.DataFrame({"source_name":list_source_name,
+                       "grade":list_grade,
+                       "target_name":list_target_name,
+                       "check":list_check})
+    df.to_excel("%s_clean.xlsx"%(filename),columns=["source_name","grade","target_name","check"])
+    list_brand_move = list(brand_set_move)
+    list_brand_keep = list(brand_set_keep)
+    list_brand_union = list(brand_set_move&brand_set_keep)
+    list_specs_move = list(specs_set_move)
+    list_specs_keep = list(specs_set_keep)
+    list_specs_union = list(specs_set_move&specs_set_keep)
+    with open("%s_brand_move.txt"%(filename),"w",encoding="utf8") as f:
+        for _move in list_brand_move:
+            f.write("%s\n"%(_move))
+    with open("%s_brand_keep.txt"%(filename),"w",encoding="utf8") as f:
+        for _keep in list_brand_keep:
+            f.write("%s\n"%(_keep))
+    with open("%s_brand_union.txt"%(filename),"w",encoding="utf8") as f:
+        for _union in list_brand_union:
+            f.write("%s\n"%(_union))
+
+    with open("%s_specs_move.txt"%(filename),"w",encoding="utf8") as f:
+        for _move in list_specs_move:
+            f.write("%s\n"%(_move))
+    with open("%s_specs_keep.txt"%(filename),"w",encoding="utf8") as f:
+        for _keep in list_specs_keep:
+            f.write("%s\n"%(_keep))
+    with open("%s_specs_union.txt"%(filename),"w",encoding="utf8") as f:
+        for _union in list_specs_union:
+            f.write("%s\n"%(_union))
+
 
 
 def insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
@@ -830,4 +919,5 @@ def interface_deletes():
 
 if __name__ == '__main__':
     # start_embedding_product_dict()
-    interface_deletes()
+    # interface_deletes()
+    clean_similar()

+ 269 - 143
BaseDataMaintenance/maintenance/product/products.py

@@ -156,11 +156,31 @@ class Product_Manager(Product_Dict_Manager):
         brand_ots_id = None
         specs_ots_id = None
         if name is not None and name!="":
-            name_vector = get_embedding_request(name)
-            if name_vector is not None:
+            Coll,_ = self.get_collection(NAME_GRADE)
+
+            search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
+
+            for _search in search_list:
+                ots_id = _search.get("standard_name_id")
+                ots_name = _search.get("ots_name")
+                standard_name = _search.get("standard_name")
+                ots_parent_id = _search.get("ots_parent_id")
+
+                if is_similar(name,ots_name) or check_product(name,ots_name):
+                    name_ots_id = ots_id
+                    new_name = standard_name
+
+                    log("checking name %s succeed %s"%(name,ots_name))
+                    # #update alias of name
+                    # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
+                    # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                    # if _flag and _dpd.updateAlias(name):
+                    #     _dpd.update_row(self.ots_client)
+                    break
+        if name_ots_id is None:
+            for name in list_candidates:
                 Coll,_ = self.get_collection(NAME_GRADE)
-
-                search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=20)
+                search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
 
                 for _search in search_list:
                     ots_id = _search.get("standard_name_id")
@@ -168,42 +188,18 @@ class Product_Manager(Product_Dict_Manager):
                     standard_name = _search.get("standard_name")
                     ots_parent_id = _search.get("ots_parent_id")
 
-                    if is_similar(name,ots_name) or check_product(name,ots_name):
+                    if is_similar(name,ots_name,_radio=95):
+
+                        log("checking name %s succeed %s"%(name,ots_name))
                         name_ots_id = ots_id
                         new_name = standard_name
 
-                        log("checking name %s succeed %s"%(name,ots_name))
                         # #update alias of name
                         # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
                         # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                         # if _flag and _dpd.updateAlias(name):
                         #     _dpd.update_row(self.ots_client)
                         break
-        if name_ots_id is None:
-            for name in list_candidates:
-                name_vector = get_embedding_request(name)
-                if name_vector is not None:
-                    Coll,_ = self.get_collection(NAME_GRADE)
-                    search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=10)
-
-                    for _search in search_list:
-                        ots_id = _search.get("standard_name_id")
-                        ots_name = _search.get("ots_name")
-                        standard_name = _search.get("standard_name")
-                        ots_parent_id = _search.get("ots_parent_id")
-
-                        if is_similar(name,ots_name,_radio=95):
-
-                            log("checking name %s succeed %s"%(name,ots_name))
-                            name_ots_id = ots_id
-                            new_name = standard_name
-
-                            # #update alias of name
-                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
-                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                            # if _flag and _dpd.updateAlias(name):
-                            #     _dpd.update_row(self.ots_client)
-                            break
         if name_ots_id is not None:
 
             if brand is not None and brand!="":
@@ -214,14 +210,104 @@ class Product_Manager(Product_Dict_Manager):
                 brand_ch = get_chinese_string(brand)
                 l_brand.extend(brand_ch)
 
+                Coll,_ = self.get_collection(BRAND_GRADE)
+
                 _find = False
                 for brand in l_brand:
+                    search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
 
-                    brand_vector = get_embedding_request(brand)
-                    if brand_vector is not None:
-                        Coll,_ = self.get_collection(BRAND_GRADE)
-                        search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=20)
+                    # log("search brand %s"%(brand))
+                    for _search in search_list:
 
+                        ots_id = _search.get("standard_name_id")
+                        ots_name = _search.get("ots_name")
+                        standard_name = _search.get("standard_name")
+                        ots_parent_id = _search.get("ots_parent_id")
+
+                        # log("check brand %s and %s"%(brand,ots_name))
+                        if is_similar(brand,ots_name) or check_brand(brand,ots_name):
+
+                            # log("check brand similar succeed:%s and %s"%(brand,ots_name))
+
+                            if ots_name==new_name:
+                                continue
+                            new_brand = standard_name
+
+                            log("checking brand %s succeed %s"%(brand,new_brand))
+                            # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
+
+                            if name_ots_id is not None:
+                                brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
+
+                                _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
+                                            DOCUMENT_PRODUCT_DICT_NAME:new_brand,
+                                            DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
+                                            DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
+                                            DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                            DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
+                                            DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                                            DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                            DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                            }
+                                _dpd_brand = Document_product_dict(_d_brand)
+                                # _dpd_brand.updateAlias(str(new_brand).lower())
+                                if not _dpd_brand.exists_row(self.ots_client):
+                                    _dpd_brand.update_row(self.ots_client)
+
+                                else:
+                                    pass
+                                    # #update alias
+                                    # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
+                                    # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                    # if _flag:
+                                    #     if _dpd.updateAlias(brand):
+                                    #         _dpd.update_row(self.ots_client)
+
+                            _find = True
+                            break
+                        else:
+                            # log("check brand similar failed:%s and %s"%(brand,ots_name))
+                            # add new brand?
+                            pass
+                    if _find:
+                        break
+                if not _find:
+                    for brand in l_brand:
+                        if self.check_new_brand(brand):
+                            new_brand = clean_product_brand(brand)
+                            if new_brand=="":
+                                continue
+                            log("adding new brand %s"%(str(new_brand)))
+                            _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
+                                        }
+                            dpdi = Document_product_dict_interface(_d_brand)
+                            dpdi.update_row(self.ots_client)
+                            break
+
+            if brand_ots_id is None:
+                _find = False
+                Coll,_ = self.get_collection(BRAND_GRADE)
+                for brand in list_candidates:
+                    if _find:
+                        break
+                    l_brand = [brand]
+                    l_brand.append(clean_product_brand(brand))
+                    brand_ch = get_chinese_string(brand)
+                    l_brand.extend(brand_ch)
+
+                    for brand in l_brand:
+                        if _find:
+                            break
+
+                        search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
                         # log("search brand %s"%(brand))
                         for _search in search_list:
 
@@ -231,10 +317,8 @@ class Product_Manager(Product_Dict_Manager):
                             ots_parent_id = _search.get("ots_parent_id")
 
                             # log("check brand %s and %s"%(brand,ots_name))
-                            if is_similar(brand,ots_name) or check_brand(brand,ots_name):
-
+                            if check_brand(brand,ots_name):
                                 # log("check brand similar succeed:%s and %s"%(brand,ots_name))
-
                                 if ots_name==new_name:
                                     continue
                                 new_brand = standard_name
@@ -271,104 +355,6 @@ class Product_Manager(Product_Dict_Manager):
 
                                 _find = True
                                 break
-                            else:
-                                # log("check brand similar failed:%s and %s"%(brand,ots_name))
-                                # add new brand?
-                                pass
-                        if _find:
-                            break
-                if not _find:
-                    for brand in l_brand:
-                        if self.check_new_brand(brand):
-                            new_brand = clean_product_brand(brand)
-                            if new_brand=="":
-                                continue
-                            log("adding new brand %s"%(str(new_brand)))
-                            _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
-                                        }
-                            dpdi = Document_product_dict_interface(_d_brand)
-                            dpdi.update_row(self.ots_client)
-                            break
-
-            if brand_ots_id is None:
-                _find = False
-                for brand in list_candidates:
-                    if _find:
-                        break
-                    l_brand = [brand]
-                    l_brand.append(clean_product_brand(brand))
-                    brand_ch = get_chinese_string(brand)
-                    l_brand.extend(brand_ch)
-
-                    for brand in l_brand:
-                        if _find:
-                            break
-                        start_time = time.time()
-                        # brand_vector = request_embedding(brand)
-                        brand_vector = get_embedding_request(brand)
-                        debug("get embedding for brand %s takes %.4fs"%(brand,time.time()-start_time))
-                        if brand_vector is not None:
-                            Coll,_ = self.get_collection(BRAND_GRADE)
-                            start_time = time.time()
-                            # search_list = search_embedding(Coll,embedding_index_name,[brand_vector],self.search_params,output_fields,limit=10)
-                            search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=10)
-                            debug("get search_list for brand %s takes %.4fs"%(brand,time.time()-start_time))
-                            # log("search brand %s"%(brand))
-                            for _search in search_list:
-
-
-                                ots_id = _search.get("standard_name_id")
-                                ots_name = _search.get("ots_name")
-                                standard_name = _search.get("standard_name")
-                                ots_parent_id = _search.get("ots_parent_id")
-
-                                # log("check brand %s and %s"%(brand,ots_name))
-                                if is_similar(brand,ots_name,_radio=95):
-                                    # log("check brand similar succeed:%s and %s"%(brand,ots_name))
-                                    if ots_name==new_name:
-                                        continue
-                                    new_brand = standard_name
-
-                                    log("checking brand %s succeed %s"%(brand,new_brand))
-                                    # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
-
-                                    if name_ots_id is not None:
-                                        brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
-
-                                        _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
-                                                    DOCUMENT_PRODUCT_DICT_NAME:new_brand,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
-                                                    DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
-                                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
-                                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
-                                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
-                                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                    }
-                                        _dpd_brand = Document_product_dict(_d_brand)
-                                        # _dpd_brand.updateAlias(str(new_brand).lower())
-                                        if not _dpd_brand.exists_row(self.ots_client):
-                                            _dpd_brand.update_row(self.ots_client)
-
-                                        else:
-                                            pass
-                                            # #update alias
-                                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
-                                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                                            # if _flag:
-                                            #     if _dpd.updateAlias(brand):
-                                            #         _dpd.update_row(self.ots_client)
-
-                                    _find = True
-                                    break
 
             if specs is not None and specs!="":
 
@@ -1003,13 +989,36 @@ def test_check_brand():
             f.write(b+"\n")
 
 def test_match():
-    a = "-SL-10XL"
-    vector = request_embedding(get_milvus_standard_name(a))
+    a = "数字化医用X射线摄影系统(DR)"
+
+
+    # vector = request_embedding(get_milvus_standard_name(a))
+    vector = [get_embedding_request(b) for b in a]
     pm = Product_Manager()
-    Coll,_ = pm.get_collection(SPECS_GRADE)
+    _GRADE = NAME_GRADE
+    Coll,_ = pm.get_collection(_GRADE)
+    print(Coll.name)
+
     output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
-    search_list = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=20)
-    print(search_list)
+    # start_time = time.time()
+    # print(Coll.query(expr=" ots_id in ['75058b275a4c1d8ee38b58c5c5cce3bb'] ",output_fields=output_fields))
+    # print("cost",time.time()-start_time)
+    # print(Coll.compact())
+    # result = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=20)
+    #
+    # final_list = []
+    # for _search in result:
+    #     _d = {}
+    #     for k in output_fields:
+    #         _d[k] = _search.entity.get(k)
+    #     final_list.append(_d)
+    # final_list = remove_repeat_item(final_list,k="ots_name")
+
+    start_time = time.time()
+    # final_list = get_embedding_search(Coll,embedding_index_name,a,_GRADE,vector,pm.search_params,output_fields,limit=5)
+    final_list = get_intellect_search(Coll,embedding_index_name,a,_GRADE,pm.search_params,output_fields,limit=10)
+    print("cost",time.time()-start_time)
+    print(final_list)
 
 
 def rebuild_milvus():
@@ -1148,6 +1157,121 @@ def move_document_product():
     mt = MultiThreadHandler(task_queue,_handle,None,30)
     mt.run()
 
# Directory of this module; the exported brand/specs move lists live next to it.
current_path = os.path.dirname(__file__)


def delete_brands():
    """Remove a list of brand entries from both OTS and Milvus.

    Reads brand names (one per line) from search_similar2_1.xlsx_brand_move.txt,
    then for each brand:
      1. searches the document_product_dict table for rows whose grade is
         BRAND_GRADE and whose name equals the brand, deleting every matching
         row from OTS (paging with next_token);
      2. deletes the corresponding vector from the BRAND_GRADE Milvus
         collection, keyed by get_milvus_product_dict_id(brand).

    Work is fanned out over 30 worker threads via MultiThreadHandler.
    Side effects only; returns None.
    """
    filename = os.path.join(current_path, "search_similar2_1.xlsx_brand_move.txt")

    ots_client = getConnect_ots()
    list_brand = []
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            brand = line.strip()
            # Skip blank lines: an empty brand name would otherwise be
            # queried in OTS and its md5 id deleted from Milvus.
            if brand:
                list_brand.append(brand)

    pm = Product_Manager()
    Coll, _ = pm.get_collection(BRAND_GRADE)

    print(Coll.name)
    # Compact first so previously deleted segments are merged before
    # issuing a new batch of deletes.
    Coll.compact()

    task_queue = Queue()
    for brand in list_brand:
        task_queue.put(brand)

    def _handle(brand, result_queue):
        # All dict rows matching this brand name at brand grade.
        bool_query = BoolQuery(must_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE, BRAND_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_NAME, brand)
        ])

        rows, next_token, total_count, is_all_succeed = ots_client.search(
            Document_product_dict_table_name, Document_product_dict_table_name + "_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("status")]), limit=100, get_total_count=True),
            ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_data = getRow_ots(rows)
        _id = get_milvus_product_dict_id(brand)

        # Page through the remaining results; next_token carries the sort.
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                Document_product_dict_table_name, Document_product_dict_table_name + "_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_data.extend(getRow_ots(rows))

        for _d in list_data:
            dpd = Document_product_dict(_d)
            dpd.delete_row(ots_client)

        delete_counts = Coll.delete(expr=" ots_id in ['%s']" % (_id)).delete_count

        log("brand %s total_count %d md5:%s delete_counts:%d" % (brand, total_count, _id, delete_counts))

    mt = MultiThreadHandler(task_queue, _handle, None, 30)
    mt.run()
+
+
+
def delete_specs():
    """Remove a list of specs entries from both OTS and Milvus.

    Reads specs values (one per line) from search_similar2_1.xlsx_specs_move.txt,
    then for each specs:
      1. searches the document_product_dict table for rows whose grade is
         SPECS_GRADE and whose name equals the specs, deleting every matching
         row from OTS (paging with next_token);
      2. deletes the corresponding vector from the SPECS_GRADE Milvus
         collection, keyed by get_milvus_product_dict_id(specs).

    Work is fanned out over 30 worker threads via MultiThreadHandler; the
    collection is compacted again once all deletes finish.
    Side effects only; returns None.
    """
    filename = os.path.join(current_path, "search_similar2_1.xlsx_specs_move.txt")

    ots_client = getConnect_ots()
    list_specs = []
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            specs = line.strip()
            # Skip blank lines: an empty specs value would otherwise be
            # queried in OTS and its md5 id deleted from Milvus.
            if specs:
                list_specs.append(specs)

    pm = Product_Manager()
    Coll, _ = pm.get_collection(SPECS_GRADE)
    print(Coll.name)
    # Compact first so previously deleted segments are merged before
    # issuing a new batch of deletes.
    Coll.compact()

    task_queue = Queue()
    for specs in list_specs:
        task_queue.put(specs)

    def _handle(specs, result_queue):
        # All dict rows matching this specs value at specs grade.
        bool_query = BoolQuery(must_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE, SPECS_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_NAME, specs)
        ])

        rows, next_token, total_count, is_all_succeed = ots_client.search(
            Document_product_dict_table_name, Document_product_dict_table_name + "_index",
            SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("status")]), limit=100, get_total_count=True),
            ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_data = getRow_ots(rows)
        _id = get_milvus_product_dict_id(specs)

        # Page through the remaining results; next_token carries the sort.
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                Document_product_dict_table_name, Document_product_dict_table_name + "_index",
                SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_data.extend(getRow_ots(rows))

        for _d in list_data:
            dpd = Document_product_dict(_d)
            dpd.delete_row(ots_client)

        delete_counts = Coll.delete(expr=" ots_id in ['%s']" % (_id)).delete_count

        # Fixed log label: this handler deletes specs, not brands.
        log("specs %s total_count %d md5:%s delete_counts:%d" % (specs, total_count, _id, delete_counts))

    mt = MultiThreadHandler(task_queue, _handle, None, 30)
    mt.run()
    # Merge the freshly deleted segments so subsequent searches stay fast.
    Coll.compact()
 
 
 
@@ -1156,10 +1280,12 @@ def test():
     # pm.test()
     # fix_product_data()
     # test_check_brand()
-    # test_match()
+    test_match()
     # rebuild_milvus()
 
-    move_document_product()
+    # move_document_product()
+    # delete_brands()
+    # delete_specs()
 
 if __name__ == '__main__':
 

+ 2 - 0
BaseDataMaintenance/model/ots/document.py

@@ -70,6 +70,8 @@ document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
 
 
 document_total_tenderee_money = "total_tenderee_money"
+
+document_update_document = "update_document"
 class Document(BaseModel):
 
     def __init__(self,_dict):