1 rok pred · 14b9e7ca57
--- a/.idea/encodings.xml
+++ b/.idea/encodings.xml
@@ -4,6 +4,9 @@
 
				     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/attachmentProcessTime.xlsx" charset="GBK" />
			
 
				     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/dataSource/searchPaddle.py" charset="GBK" />
			
 
				     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx" charset="GBK" />
			
 
				+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_exclude_name_from_tw_prod.csv" charset="GBK" />
			
 
				+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_product_name_exclude_name.csv" charset="GBK" />
			
 
				+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/update_product.csv" charset="GBK" />
			
 
				     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-01-19_214304_export11.xlsx" charset="GBK" />
			
 
				     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-10-14_190838_数据导出.xlsx" charset="GBK" />
			
 
				     <file url="file://$PROJECT_DIR$/attachmentProcessTime2.xlsx" charset="GBK" />
			
--- a/BaseDataMaintenance/maintenance/dataflow.py
+++ b/BaseDataMaintenance/maintenance/dataflow.py
@@ -3888,15 +3888,15 @@ class Dataflow_dumplicate(Dataflow):
 
				 
			
 
				 
			
 
				             _time = time.time()
			
 
				-            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
			
 
				+            # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
			
 
				             final_list = self.dumplicate_fianl_check(base_list)
			
 
				 
			
 
				             exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),table_name)
			
 
				-            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
			
 
				+            # log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
			
 
				             best_docid = self.get_best_docid(final_list)
			
 
				 
			
 
				             final_list_docid = [a["docid"] for a in final_list]
			
 
				-            log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
			
 
				+            # log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
			
 
				             _d = {"partitionkey":item["partitionkey"],
			
 
				                   "docid":item["docid"],
			
 
				                   "status":random.randint(*flow_dumplicate_status_to),
			
--- a/BaseDataMaintenance/maintenance/dataflow_mq.py
+++ b/BaseDataMaintenance/maintenance/dataflow_mq.py
@@ -763,7 +763,12 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
				                 try:
			
 
				                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
			
 
				                     _soup = BeautifulSoup(_dochtmlcon,"lxml")
			
 
				-                    _soup = article_limit(_soup,50000)
			
 
				+                    if len(_dochtmlcon)>200000:
			
 
				+                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
			
 
				+                        _find.decompose()
			
 
				+                    else:
			
 
				+                        _soup = BeautifulSoup(_dochtmlcon,"lxml")
			
 
				+                        _soup = article_limit(_soup,50000)
			
 
				                     _dochtmlcon = str(_soup)
			
 
				                 except Exception as e:
			
 
				                     traceback.print_exc()
			
--- a/BaseDataMaintenance/maintenance/product/productUtils.py
+++ b/BaseDataMaintenance/maintenance/product/productUtils.py
@@ -25,7 +25,7 @@ from BaseDataMaintenance.model.ots.enterprise import *
 
				 from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
			
 
				 
			
 
				 area_set = get_area_set()
			
 
				-
			
 
				+import jieba
			
 
				 
			
 
				 def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
			
 
				 
			
@@ -34,17 +34,22 @@ def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,
 
				     if v is not None:
			
 
				         vector.append(v)
			
 
				     if len(str(name))>=5:
			
 
				-        strides = [3,6]
			
 
				+        name_cut = list(jieba.cut(name))
			
 
				+        strides = [1,2]
			
 
				         for stride in strides:
			
 
				-            steps = len(name)//stride
			
 
				+            steps = len(name_cut)//stride
			
 
				             if len(name)%stride>=stride//2+1:
			
 
				                 steps += 1
			
 
				             _begin = 0
			
 
				+            _name = ""
			
 
				             for i in range(min(steps,max_steps)):
			
 
				-                _name = str(name)[i*stride:(i+1)*stride+2]
			
 
				+                _name += "".join(name_cut[i*stride:(i+1)*stride])
			
 
				+                if len(_name)<2:
			
 
				+                    continue
			
 
				                 v = get_embedding_request(_name)
			
 
				                 if v is not None:
			
 
				                     vector.append(v)
			
 
				+                _name = ""
			
 
				 
			
 
				     if len(vector)>0:
			
 
				         list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
			
@@ -67,10 +72,10 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
 
				         except Exception as e:
			
 
				             log("get redis data error")
			
 
				         if _search_list is not None:
			
 
				-            log("_search_list is not None")
			
 
				+            # log("_search_list is not None")
			
 
				             return json.loads(_search_list)
			
 
				         else:
			
 
				-            log("search from milvus")
			
 
				+            # log("search from milvus")
			
 
				             list_result = []
			
 
				             result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
			
 
				             for hits in result:
			
@@ -84,6 +89,9 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
 
				                     _d[k] = _search.entity.get(k)
			
 
				                 final_list.append(_d)
			
 
				             final_list = remove_repeat_item(final_list,k="ots_name")
			
 
				+            for _d in final_list:
			
 
				+                _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
			
 
				+            final_list.sort(key=lambda x:x.get("length_dis",0))
			
 
				             final_list.sort(key=lambda x:x.get("level",1))
			
 
				             try:
			
 
				                 db.set(_md5,json.dumps(final_list))
			
@@ -129,6 +137,7 @@ def get_embedding_request(sentence,retry_times=3):
 
				             if _embedding is not None:
			
 
				                 try:
			
 
				                     db.set(_md5,json.dumps(_embedding))
			
 
				+                    db.expire(_md5,60*60)
			
 
				                 except Exception as e:
			
 
				                     log("set redis data error")
			
 
				             return _embedding
			
@@ -230,10 +239,6 @@ def is_similar(source,target,_radio=None):
 
				                 return True
			
 
				         elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
			
 
				             return True
			
 
				-    # 全中文判断是否包含
			
 
				-    if len(source)==max_len and judge_pur_chinese(target):
			
 
				-        if str(source).find(target)>=0:
			
 
				-            return True
			
 
				     return False
			
 
				 
			
 
				 
			
@@ -254,7 +259,15 @@ def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9]+$"),find_patt
 
				         else:
			
 
				             return False
			
 
				 
			
 
				-def check_product(source,target):
			
 
				+def check_product(source,target,remove_words):
			
 
				+
			
 
				+    if remove_words is not None and remove_words!="":
			
 
				+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
			
 
				+        list_split = [a.strip() for a in _split if a.strip()!=""]
			
 
				+        for _s in list_split:
			
 
				+            if str(source).find(_s)>=0:
			
 
				+                return False
			
 
				+
			
 
				     _check = check_char(source,target)
			
 
				     if _check:
			
 
				         return True
			
@@ -262,36 +275,48 @@ def check_product(source,target):
 
				         if _check==False:
			
 
				             return False
			
 
				 
			
 
				-    if is_contain(source,target,min_len=2):
			
 
				+    if len(source)>len(target) and target in source:
			
 
				         return True
			
 
				+
			
 
				     max_len = max(len(source),len(target))
			
 
				     min_len = min(len(source),len(target))
			
 
				-    min_ratio = 92
			
 
				+
			
 
				     if min_len<2:
			
 
				         return False
			
 
				     elif max_len<=5:
			
 
				-        min_ratio=94
			
 
				+        min_ratio=96
			
 
				     else:
			
 
				-        min_ratio = 90
			
 
				+        min_ratio = 95
			
 
				+    min_ratio = 98
			
 
				     if is_similar(source,target,min_ratio):
			
 
				         return True
			
 
				     return False
			
 
				 
			
 
				 
			
 
				-def check_brand(source,target):
			
 
				+def check_brand(source,target,remove_words):
			
 
				+
			
 
				+
			
 
				     source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
			
 
				     target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
			
 
				 
			
 
				+    if remove_words is not None and remove_words!="":
			
 
				+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
			
 
				+        list_split = [a.strip() for a in _split if a.strip()!=""]
			
 
				+        for _s in list_split:
			
 
				+            if str(source).find(_s)>=0:
			
 
				+                return False
			
 
				+
			
 
				     max_len = max(len(source),len(target))
			
 
				     min_len = min(len(source),len(target))
			
 
				 
			
 
				-    min_ratio = 92
			
 
				+
			
 
				     if min_len<2:
			
 
				         return False
			
 
				     elif max_len<=5:
			
 
				         min_ratio=94
			
 
				     else:
			
 
				         min_ratio = 90
			
 
				+    min_ratio = 98
			
 
				 
			
 
				     source_c = "".join(get_chinese_string(source))
			
 
				     target_c = "".join(get_chinese_string(target))
			
@@ -305,17 +330,20 @@ def check_brand(source,target):
 
				 
			
 
				     if len(source_c)>=2 and len(target_c)>=2:
			
 
				         if not(source_c in area_set or target_c in area_set):
			
 
				-            if is_similar(source_c,target_c,min_ratio):
			
 
				+            if is_contain(source_c,target_c):
			
 
				                 return True
			
 
				 
			
 
				-            if is_contain(source_c,target_c):
			
 
				+            if is_similar(source_c,target_c,min_ratio):
			
 
				                 return True
			
 
				+
			
 
				     if has_same_specs_count(source,target):
			
 
				-        if is_similar(source,target,min_ratio):
			
 
				-            return True
			
 
				 
			
 
				         if is_contain(source,target):
			
 
				             return True
			
 
				+        if is_similar(source,target,min_ratio):
			
 
				+            return True
			
 
				+
			
 
				+
			
 
				 
			
 
				 SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
			
 
				 NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
			
@@ -533,10 +561,11 @@ def clean_product_quantity(product_quantity):
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     # print(check_brand('杭州郎基','杭州利华'))
			
 
				-    print(check_product("数字化医用X射线摄影系统（DR）","DR"))
			
 
				+    # print(check_product("医用冷藏箱","医用","a|"))
			
 
				+
			
 
				     # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
			
 
				     # import Levenshtein
			
 
				     # print(Levenshtein.ratio('助听器','助行器'))
			
 
				     # print(clean_product_specs("//4008SverssionV10"))
			
 
				-    # print(is_legal_brand(getConnect_ots(),"康复"))
			
 
				+    print(is_legal_brand(getConnect_ots(),"医用外科口罩"))
			
 
				     # print(check_specs("500ml","3500ml"))
			
--- a/BaseDataMaintenance/maintenance/product/product_dict.py
+++ b/BaseDataMaintenance/maintenance/product/product_dict.py
@@ -101,7 +101,7 @@ class Product_Dict_Manager():
 
				 
			
 
				 
			
 
				 
			
 
				-    def embedding_producer(self,columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS]):
			
 
				+    def embedding_producer(self,columns=[DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_PARENT_ID,DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,DOCUMENT_PRODUCT_DICT_LEVEL]):
			
 
				 
			
 
				         bool_query = BoolQuery(
			
 
				             must_queries=[RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,3,5,True,True)],
			
@@ -131,16 +131,20 @@ class Product_Dict_Manager():
 
				     def embedding_comsumer(self):
			
 
				         def handle(item,result_queue):
			
 
				             try:
			
 
				+                _id = item.get(DOCUMENT_PRODUCT_DICT_ID)
			
 
				                 name = str(item.get(DOCUMENT_PRODUCT_DICT_NAME))[:MAX_NAME_LENGTH]
			
 
				 
			
 
				                 parent_id = item.get(DOCUMENT_PRODUCT_DICT_PARENT_ID)
			
 
				                 grade = item.get(DOCUMENT_PRODUCT_DICT_GRADE)
			
 
				                 Coll,_ = self.get_collection(grade)
			
 
				                 standard_alias = item.get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,"")
			
 
				+                remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
			
 
				+                level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL,1)
			
 
				 
			
 
				-                if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias):
			
 
				 
			
 
				-                    _pd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
			
 
				+                if insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level):
			
 
				+
			
 
				+                    _pd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:_id,DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED})
			
 
				                     _pd.update_row(self.ots_client)
			
 
				 
			
 
				 
			
@@ -919,7 +923,7 @@ def dict_interface_delete(name,grade,ots_client = getConnect_ots()):
 
				 
			
 
				 def interface_deletes():
			
 
				     a = '''
			
 
				-    眼科
			
 
				+    明细
			
 
				     '''
			
 
				     grade = 4
			
 
				     ots_client=getConnect_ots()
			
@@ -1009,6 +1013,6 @@ def clean_brands():
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     # start_embedding_product_dict()
			
 
				-    # interface_deletes()
			
 
				+    interface_deletes()
			
 
				     # clean_similar()
			
 
				-    clean_brands()
			
 
				+    # clean_brands()
			
--- a/BaseDataMaintenance/maintenance/product/products.py
+++ b/BaseDataMaintenance/maintenance/product/products.py
@@ -30,12 +30,13 @@ import logging
 
				 root = logging.getLogger()
			
 
				 root.setLevel(logging.INFO)
			
 
				 from uuid import uuid4
			
 
				+from multiprocessing import Queue as PQueue
			
 
				 
			
 
				 class Product_Manager(Product_Dict_Manager):
			
 
				 
			
 
				     def __init__(self):
			
 
				         super(Product_Manager, self).__init__()
			
 
				-        self.process_queue = Queue()
			
 
				+        self.process_queue = PQueue()
			
 
				         self.ots_client = getConnect_ots()
			
 
				 
			
 
				         self.set_id = set()
			
@@ -68,6 +69,7 @@ class Product_Manager(Product_Dict_Manager):
 
				                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
			
 
				         list_data = getRow_ots(rows)
			
 
				         _count = len(list_data)
			
 
				+        log("producer %d/%d"%(q_size,total_count))
			
 
				         list_id = []
			
 
				         for _d in list_data:
			
 
				             _id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
			
@@ -113,7 +115,7 @@ class Product_Manager(Product_Dict_Manager):
 
				 
			
 
				 
			
 
				 
			
 
				-    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
			
 
				+    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]):
			
 
				         '''
			
 
				         Standardizes the product data
			
 
				         通过匹配标准参数表进行标准化，匹配是非精确匹配，校验规则是？
			
@@ -140,8 +142,15 @@ class Product_Manager(Product_Dict_Manager):
 
				         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
			
 
				         parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
			
 
				 
			
 
				+
			
 
				+        original_name = name
			
 
				+        original_brand = brand
			
 
				+        original_specs = specs
			
 
				+
			
 
				         list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
			
 
				 
			
 
				+        list_candidate_brand_specs = [a for a in [brand,specs,parameters,name] if a!=""]
			
 
				+
			
 
				         if brand=="" and parameters!="":
			
 
				             brand = parameters
			
 
				         if specs=="" and parameters!="":
			
@@ -165,12 +174,14 @@ class Product_Manager(Product_Dict_Manager):
 
				                 ots_name = _search.get("ots_name")
			
 
				                 standard_name = _search.get("standard_name")
			
 
				                 ots_parent_id = _search.get("ots_parent_id")
			
 
				+                remove_words = _search.get("remove_words")
			
 
				 
			
 
				-                if is_similar(name,ots_name) or check_product(name,ots_name):
			
 
				+                if check_product(name,ots_name,remove_words):
			
 
				                     name_ots_id = ots_id
			
 
				+                    original_name = name
			
 
				                     new_name = standard_name
			
 
				 
			
 
				-                    log("checking name %s succeed %s"%(name,ots_name))
			
 
				+                    log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
			
 
				                     # #update alias of name
			
 
				                     # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
			
 
				                     # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
			
@@ -187,11 +198,13 @@ class Product_Manager(Product_Dict_Manager):
 
				                     ots_name = _search.get("ots_name")
			
 
				                     standard_name = _search.get("standard_name")
			
 
				                     ots_parent_id = _search.get("ots_parent_id")
			
 
				+                    remove_words = _search.get("remove_words")
			
 
				 
			
 
				-                    if is_similar(name,ots_name,_radio=95):
			
 
				+                    if check_product(name,ots_name,remove_words):
			
 
				 
			
 
				-                        log("checking name %s succeed %s"%(name,ots_name))
			
 
				+                        log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
			
 
				                         name_ots_id = ots_id
			
 
				+                        original_name = name
			
 
				                         new_name = standard_name
			
 
				 
			
 
				                         # #update alias of name
			
@@ -206,9 +219,6 @@ class Product_Manager(Product_Dict_Manager):
 
				 
			
 
				                 s_brand = brand
			
 
				                 l_brand = [brand]
			
 
				-                l_brand.append(clean_product_brand(s_brand))
			
 
				-                brand_ch = get_chinese_string(brand)
			
 
				-                l_brand.extend(brand_ch)
			
 
				 
			
 
				                 Coll,_ = self.get_collection(BRAND_GRADE)
			
 
				 
			
@@ -223,14 +233,16 @@ class Product_Manager(Product_Dict_Manager):
 
				                         ots_name = _search.get("ots_name")
			
 
				                         standard_name = _search.get("standard_name")
			
 
				                         ots_parent_id = _search.get("ots_parent_id")
			
 
				+                        remove_words = _search.get("remove_words")
			
 
				 
			
 
				                         # log("check brand %s and %s"%(brand,ots_name))
			
 
				-                        if is_similar(brand,ots_name) or check_brand(brand,ots_name):
			
 
				+                        if check_brand(brand,ots_name,remove_words):
			
 
				 
			
 
				                             # log("check brand similar succeed：%s and %s"%(brand,ots_name))
			
 
				 
			
 
				                             if ots_name==new_name:
			
 
				                                 continue
			
 
				+                            original_brand = brand
			
 
				                             new_brand = standard_name
			
 
				 
			
 
				                             log("checking brand %s succeed %s"%(brand,new_brand))
			
@@ -277,6 +289,7 @@ class Product_Manager(Product_Dict_Manager):
 
				                             new_brand = clean_product_brand(brand)
			
 
				                             if new_brand=="":
			
 
				                                 continue
			
 
				+                            original_brand = brand
			
 
				                             log("adding new brand %s"%(str(new_brand)))
			
 
				                             _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
			
 
				                                         DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
			
@@ -299,9 +312,6 @@ class Product_Manager(Product_Dict_Manager):
 
				                     if _find:
			
 
				                         break
			
 
				                     l_brand = [brand]
			
 
				-                    l_brand.append(clean_product_brand(brand))
			
 
				-                    brand_ch = get_chinese_string(brand)
			
 
				-                    l_brand.extend(brand_ch)
			
 
				 
			
 
				                     for brand in l_brand:
			
 
				                         if _find:
			
@@ -315,12 +325,14 @@ class Product_Manager(Product_Dict_Manager):
 
				                             ots_name = _search.get("ots_name")
			
 
				                             standard_name = _search.get("standard_name")
			
 
				                             ots_parent_id = _search.get("ots_parent_id")
			
 
				+                            remove_words = _search.get("remove_words")
			
 
				 
			
 
				                             # log("check brand %s and %s"%(brand,ots_name))
			
 
				-                            if check_brand(brand,ots_name):
			
 
				+                            if check_brand(brand,ots_name,remove_words):
			
 
				                                 # log("check brand similar succeed：%s and %s"%(brand,ots_name))
			
 
				                                 if ots_name==new_name:
			
 
				                                     continue
			
 
				+                                original_brand = brand
			
 
				                                 new_brand = standard_name
			
 
				 
			
 
				                                 log("checking brand %s succeed %s"%(brand,new_brand))
			
@@ -392,6 +404,7 @@ class Product_Manager(Product_Dict_Manager):
 
				                                 # log("specs is_similar")
			
 
				                                 if check_specs(c_specs,ots_name):
			
 
				                                     break_flag = True
			
 
				+                                    original_specs = c_specs
			
 
				                                     new_specs = standard_name
			
 
				                                     log("check_specs %s succeed %s"%(specs,new_specs))
			
 
				 
			
@@ -435,6 +448,7 @@ class Product_Manager(Product_Dict_Manager):
 
				                     for specs in list_similar_specs:
			
 
				                         if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
			
 
				                             debug("is_legal_specs")
			
 
				+                            original_specs = specs
			
 
				                             new_specs = clean_product_specs(specs)
			
 
				                             # insert into document_product_dict a new record
			
 
				                             # to update the document_product_dict which is builded for search
			
@@ -471,7 +485,7 @@ class Product_Manager(Product_Dict_Manager):
 
				                             break
			
 
				         if specs_ots_id is None:
			
 
				             _find = False
			
 
				-            for specs in list_candidates:
			
 
				+            for specs in list_candidate_brand_specs:
			
 
				                 if _find:
			
 
				                     break
			
 
				 
			
@@ -505,10 +519,11 @@ class Product_Manager(Product_Dict_Manager):
 
				                             ots_parent_id = _search.get("ots_parent_id")
			
 
				 
			
 
				                             debug("checking specs %s and %s"%(specs,ots_name))
			
 
				-                            if is_similar(specs,ots_name):
			
 
				+                            if is_similar(c_specs,ots_name):
			
 
				                                 # log("specs is_similar")
			
 
				                                 if check_specs(c_specs,ots_name):
			
 
				                                     break_flag = True
			
 
				+                                    original_specs = c_specs
			
 
				                                     new_specs = standard_name
			
 
				                                     if brand_ots_id is not None:
			
 
				                                         # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
			
@@ -610,9 +625,9 @@ class Product_Manager(Product_Dict_Manager):
 
				 
			
 
				             _product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
			
 
				 
			
 
				-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
			
 
				-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
			
 
				-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
			
 
				+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,original_name,True)
			
 
				+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,original_brand,True)
			
 
				+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,original_specs,True)
			
 
				 
			
 
				             bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
			
 
				             if bid_filemd5s is not None:
			
@@ -852,13 +867,14 @@ def fix_product_data():
 
				     # table_name = "document_product"
			
 
				     # table_index = "document_product_index"
			
 
				 
			
 
				-    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID]
			
 
				+    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
			
 
				     ots_client = getConnect_ots()
			
 
				-    bool_query = BoolQuery(must_queries=[
			
 
				+    bool_query = BoolQuery(should_queries=[
			
 
				         # RangeQuery("status",501),
			
 
				         # TermQuery("docid",246032980)
			
 
				 
			
 
				-        RangeQuery("status",201,301)
			
 
				+        RangeQuery("status",201,301),
			
 
				+        RangeQuery("status",401,451)
			
 
				         # WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
			
 
				         # TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
			
 
				     ])
			
@@ -923,9 +939,10 @@ def fix_product_data():
 
				         dpt.update_row(ots_client)
			
 
				 
			
 
				         new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
			
 
				-        _d = {DOCUMENT_PRODUCT_ID:new_id}
			
 
				-        dp = Document_product(_d)
			
 
				-        dp.delete_row(ots_client)
			
 
				+        if new_id is not None and new_id!="":
			
 
				+            _d = {DOCUMENT_PRODUCT_ID:new_id}
			
 
				+            dp = Document_product(_d)
			
 
				+            dp.delete_row(ots_client)
			
 
				 
			
 
				     def handle(item,result_queue):
			
 
				         win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
			
@@ -989,7 +1006,7 @@ def test_check_brand():
 
				             f.write(b+"\n")
			
 
				 
			
 
				 def test_match():
			
 
				-    a = "数字化医用X射线摄影系统（DR）"
			
 
				+    a = "兽医设备"
			
 
				 
			
 
				 
			
 
				     # vector = request_embedding(get_milvus_standard_name(a))
			
@@ -999,7 +1016,7 @@ def test_match():
 
				     Coll,_ = pm.get_collection(_GRADE)
			
 
				     print(Coll.name)
			
 
				 
			
 
				-    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
			
 
				+    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
			
 
				     # start_time = time.time()
			
 
				     # print(Coll.query(expr=" ots_id in ['75058b275a4c1d8ee38b58c5c5cce3bb'] ",output_fields=output_fields))
			
 
				     # print("cost",time.time()-start_time)
			
@@ -1257,19 +1274,57 @@ def delete_specs():
 
				     mt.run()
			
 
				     Coll.compact()
			
 
				 
			
 
				+def remove_redis_keys():
				+    # Delete every key of the Redis database reached through pool_product.
				+    redis.Redis(connection_pool=pool_product).flushdb()
			
 
				+
			
 
				+
			
 
				+def update_document_product_dict():
				+    """Apply the rows of update_product.csv to the product dictionary table.
				+
				+    For every CSV row the dictionary entry with the same name and grade is
				+    searched. When exactly one entry matches, its
				+    DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS, DOCUMENT_PRODUCT_DICT_REMOVE_WORDS
				+    and DOCUMENT_PRODUCT_DICT_LEVEL values are rewritten and
				+    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED is set to IS_SYNCHONIZED+1.
				+    Rows matching zero or several entries are skipped, as are rows
				+    whose name cell is empty.
				+    """
				+    import pandas as pd
				+    filename = "update_product.csv"
				+    df = pd.read_csv(filename,encoding="gbk")
				+    ots_client = getConnect_ots()
				+
				+    def _blank_if_nan(value):
				+        # pandas reads an empty cell as NaN; store an empty string instead
				+        return "" if pd.isna(value) else value
				+
				+    for name,grade,standard_alias,remove_words,level in zip(df["name"],df["grade"],df["standard_alias"],df["remove_words"],df["level"]):
				+        name = str(_blank_if_nan(name)).strip()
				+        if name=="":
				+            # nothing to look up for a row without a name
				+            continue
				+        bool_query = BoolQuery(must_queries=[
				+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
				+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
				+        ])
				+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
				+                                                                       SearchQuery(bool_query,get_total_count=True),
				+                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
				+        if total_count!=1:
				+            # only update when the (name, grade) pair identifies a single entry
				+            continue
				+        dpd = Document_product_dict(getRow_ots(rows)[0])
				+        # use the level given in the CSV; fall back to 1 when the cell is empty
				+        level = 1 if pd.isna(level) else int(level)
				+        if level==1 and re.search("器械|设备|其他",name) is not None:
				+            level = 2
				+        dpd.setValue(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,_blank_if_nan(standard_alias),True)
				+        dpd.setValue(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,_blank_if_nan(remove_words),True)
				+        dpd.setValue(DOCUMENT_PRODUCT_DICT_LEVEL,level,True)
				+        dpd.setValue(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED+1,True)
				+        dpd.update_row(ots_client)
				+        print(dpd.getProperties())
			
 
				+
			
 
				 
			
 
				 
			
 
				 def test():
			
 
				     # pm = Product_Manager()
			
 
				     # pm.test()
			
 
				-    # fix_product_data()
			
 
				+    fix_product_data()
			
 
				     # test_check_brand()
			
 
				     # test_match()
			
 
				-    rebuild_milvus()
			
 
				+    # rebuild_milvus()
			
 
				 
			
 
				     # move_document_product()
			
 
				     # delete_brands()
			
 
				     # delete_specs()
			
 
				+    # remove_redis_keys()
			
 
				+    # update_document_product_dict()
			
 
				+
			
 
				 
			
 
				 if __name__ == '__main__':