View source

Merge remote-tracking branch 'origin/master'

znj 1 year ago
parent
commit
054b7dfe34

+ 1 - 0
.gitignore

@@ -4,3 +4,4 @@
 /.idea/
 /attachmentProcessTime2.xlsx
 /BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx
+/BaseDataMaintenance/test/

+ 3 - 0
.idea/encodings.xml

@@ -4,6 +4,9 @@
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/attachmentProcessTime.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/dataSource/searchPaddle.py" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_exclude_name_from_tw_prod.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/select_product_product_name_exclude_name.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BaseDataMaintenance/maintenance/product/update_product.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-01-19_214304_export11.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BaseDataMaintenance/model/ots/2022-10-14_190838_数据导出.xlsx" charset="GBK" />
     <file url="file://$PROJECT_DIR$/attachmentProcessTime2.xlsx" charset="GBK" />

+ 2 - 2
BaseDataMaintenance/common/milvusUtil.py

@@ -18,7 +18,7 @@ def create_embedding_schema(collection_name,fields,index_name,index_params):
     if not has:
         print("creating collection")
         coll_schema = CollectionSchema(fields,"this is the embedding schema")
-        coll = Collection(collection_name,coll_schema,consistency_level="Strong")
+        coll = Collection(collection_name,coll_schema)
 
         #create index for milvus_embedding
         coll.create_index(index_name,index_params=index_params)
@@ -47,7 +47,7 @@ def insert_embedding(coll,entities,retry_times =3):
 
 
 def search_embedding(coll,index_name,vector,search_params,output_fields,limit=3,retry_times=3):
-    for _ in retry_times:
+    for _ in range(retry_times):
         try:
             list_result = []
             result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
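
Two fixes in this file: `Collection` creation drops `consistency_level="Strong"` (falling back to Milvus's default consistency), and the retry loop in `search_embedding` iterated over the integer `retry_times`, which raises `TypeError: 'int' object is not iterable`; `range(retry_times)` is the intended loop. A minimal sketch of the corrected retry pattern, with `do_search` as a hypothetical stand-in for the `coll.search(...)` call:

```python
# Minimal sketch of the corrected retry pattern; `do_search` is a
# hypothetical stand-in for the coll.search(...) call.
def with_retries(do_search, retry_times=3):
    for attempt in range(retry_times):   # `for _ in retry_times` raised TypeError
        try:
            return do_search()
        except Exception:
            if attempt == retry_times - 1:
                raise                    # give up after the last attempt
```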

+ 2 - 2
BaseDataMaintenance/dataSource/source.py

@@ -155,12 +155,12 @@ def getConnect_redis_doc():
 
 def getConnect_redis_product():
     db = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT,
-                           db=8,password=REDIS_PASS)
+                           db=9,password=REDIS_PASS)
     return db
 
 def getConnect_redis_product_pool():
     pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT,
-                                db=8,password=REDIS_PASS,max_connections=40)
+                                db=9,password=REDIS_PASS,max_connections=40)
     return pool
 
 if __name__=="__main__":
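
Both the one-off client and the connection pool move from Redis logical database 8 to 9, keeping the two product accessors consistent. A short sketch of how the pool variant is typically consumed with redis-py (`REDIS_HOST`, `REDIS_PORT` and `REDIS_PASS` are this module's config constants, assumed in scope):

```python
import redis

# REDIS_HOST / REDIS_PORT / REDIS_PASS come from this module's config (assumed).
pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT,
                            db=9, password=REDIS_PASS, max_connections=40)
client = redis.StrictRedis(connection_pool=pool)  # db is fixed by the pool
client.ping()
```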

+ 13 - 9
BaseDataMaintenance/maintenance/dataflow.py

@@ -2796,7 +2796,7 @@ class Dataflow_dumplicate(Dataflow):
 
 
     def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
-        def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status]):
+        def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document]):
             q_size = self.queue_dumplicate.qsize()
             log("dumplicate queue size %d"%(q_size))
             if q_size>flow_process_count//3:
@@ -2806,7 +2806,7 @@ class Dataflow_dumplicate(Dataflow):
                 # TermQuery("docid",271983871)
             ])
             rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
-                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_update_document,SortOrder.DESC),FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                                 ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
             log("flow_dumplicate producer total_count:%d"%total_count)
             list_dict = getRow_ots(rows)
@@ -2842,8 +2842,8 @@ class Dataflow_dumplicate(Dataflow):
 
     def flow_dumpcate_comsumer(self):
         from multiprocessing import Process
-        process_count = 1
-        thread_count = 20
+        process_count = 2
+        thread_count = 30
         list_process = []
         def start_thread():
             mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,thread_count,1,need_stop=False,ots_client=self.ots_client)
@@ -3861,7 +3861,7 @@ class Dataflow_dumplicate(Dataflow):
             set_docid = set()
 
             list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=False)
-            print("len_rules",len(list_rules),table_name,table_index)
+            # print("len_rules",len(list_rules),table_name,table_index)
             list_rules.sort(key=lambda x:x["confidence"],reverse=True)
             _i = 0
             step = 5
@@ -3888,15 +3888,15 @@ class Dataflow_dumplicate(Dataflow):
 
 
             _time = time.time()
-            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
+            # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
             final_list = self.dumplicate_fianl_check(base_list)
 
             exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),table_name)
-            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
+            # log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
             best_docid = self.get_best_docid(final_list)
 
             final_list_docid = [a["docid"] for a in final_list]
-            log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
+            # log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
             _d = {"partitionkey":item["partitionkey"],
                   "docid":item["docid"],
                   "status":random.randint(*flow_dumplicate_status_to),
@@ -3914,6 +3914,7 @@ class Dataflow_dumplicate(Dataflow):
 
             remove_list = []
 
+
             if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
@@ -3940,6 +3941,9 @@ class Dataflow_dumplicate(Dataflow):
             list_docids.append(best_docid)
             b_log = False if upgrade else True
 
+            if item.get(document_update_document)=="true":
+                dtmp.setValue(document_tmp_save,1,True)
+
             if exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0:
                 log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)
@@ -4035,7 +4039,7 @@ class Dataflow_dumplicate(Dataflow):
         schedule.add_job(self.flow_dumplicate,"cron",second="*/40")
         schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
         schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
-        schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+        # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
         schedule.start()
 
     def changeSaveStatus(self,list_dict):
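
The producer's query now sorts by `document_update_document` descending before `docid` descending, so re-submitted (updated) documents are dequeued for deduplication first; downstream, `dumplicate_comsumer_handle` force-saves rows whose update flag is `"true"`. A hedged sketch of the two-level sort with the Tablestore SDK (`bool_query`, `ots_client` and `columns` built as in the diff):

```python
from tablestore import (SearchQuery, Sort, FieldSort, SortOrder,
                        ColumnsToGet, ColumnReturnType)

# bool_query / ots_client / columns are built as in the diff;
# document_update_document is the column-name constant used by this codebase.
query = SearchQuery(
    bool_query,
    sort=Sort(sorters=[
        FieldSort(document_update_document, SortOrder.DESC),  # updated docs first
        FieldSort("docid", SortOrder.DESC),                   # then newest docid
    ]),
    limit=100,
    get_total_count=True,
)
rows, next_token, total_count, is_all_succeed = ots_client.search(
    "document_tmp", "document_tmp_index", query,
    ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
```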

+ 5 - 1
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -763,7 +763,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
                     _soup = BeautifulSoup(_dochtmlcon,"lxml")
-                    _soup = article_limit(_soup,50000)
+                    if len(_dochtmlcon)>200000:
+                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
+                        _find.decompose()
+                    else:
+                        _soup = article_limit(_soup,50000)
                     _dochtmlcon = str(_soup)
                 except Exception as e:
                     traceback.print_exc()
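
For very large pages (over 200,000 characters) the extractor now removes the embedded attachment text (the `div` with class `richTextFetch`) outright instead of trimming with `article_limit`. Note that `find()` returns `None` when that div is absent; in the diff the surrounding try/except absorbs the resulting `AttributeError`. A defensive sketch of the same guard (`article_limit` is the project's own truncation helper, assumed in scope):

```python
from bs4 import BeautifulSoup

def shrink_html(dochtmlcon, hard_limit=200000, soft_limit=50000):
    # article_limit is the project's own truncation helper (assumed in scope).
    soup = BeautifulSoup(dochtmlcon, "lxml")
    if len(dochtmlcon) > hard_limit:
        rich = soup.find("div", attrs={"class": "richTextFetch"})
        if rich is not None:      # the diff relies on the outer try/except here
            rich.decompose()      # drop the attachment dump entirely
    else:
        soup = article_limit(soup, soft_limit)
    return str(soup)
```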

+ 48 - 7
BaseDataMaintenance/maintenance/product/1.py

@@ -24,14 +24,55 @@ x = [[1,1,22,2,2,2,2],
      [3,1,22,2,2,2,2],
      [1.5,1,22,2,2,2,2]]
 km.fit(x)
-print(km.predict(x))
-
-from uuid import uuid4
-
-print(type(uuid4().hex))
-
-print(5==5.00)
 
+a = '''
+bidding_budget double,
+    brand_specs string,
+    province string,
+    city STRING,
+    district string,
+    create_time string,
+    dict_name_id string,
+    docchannel bigint,
+    docid bigint,
+    doctitle string,
+    full_name string,
+    industry string,
+    info_type string,
+    page_time string,
+    page_time_year string,
+    procurement_system STRING,
+    project_code string,
+    project_name string,
+    quantity bigint,
+    quantity_unit string,
+    supplier string,
+    tenderee string,
+    tenderee_contact string,
+    tenderee_phone string,
+    update_time string,
+    win_bid_price double,
+    win_tenderer string,
+    win_tenderer_manager string,
+    win_tenderer_phone string,
+    dict_brand_id string,
+    dict_specs_id string,
+    dump_id string,
+    total_price double,
+    unit_price double,
+    bid_filemd5s string
+'''
+
+list_c = []
+for b in a.split("\n"):
+     c = b.strip()
+     if c=="":
+          continue
+     d = c.split(" ")[0]
+     list_c.append(d)
+print(",".join(list_c))
+
+print("BENEHEARTD6".lower()=="BeneHeartD6".lower())
 
 
 

+ 157 - 30
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -25,6 +25,39 @@ from BaseDataMaintenance.model.ots.enterprise import *
 from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
 
 area_set = get_area_set()
+import jieba
+
+def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
+
+    vector = []
+    v = get_embedding_request(name)
+    if v is not None:
+        vector.append(v)
+    if len(str(name))>=5:
+        name_cut = list(jieba.cut(name))
+        strides = [1,2]
+        for stride in strides:
+            steps = len(name_cut)//stride
+            if len(name)%stride>=stride//2+1:
+                steps += 1
+            _begin = 0
+            _name = ""
+            for i in range(min(steps,max_steps)):
+                _name += "".join(name_cut[i*stride:(i+1)*stride])
+                if len(_name)<2:
+                    continue
+                v = get_embedding_request(_name)
+                if v is not None:
+                    vector.append(v)
+                _name = ""
+
+    if len(vector)>0:
+        list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
+        if list_search:
+            return list_search
+
+    return []
+
 
 def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit=3):
 
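The new `get_intellect_search` embeds more than the full name: for names of five or more characters it also embeds sliding windows of one and two jieba tokens (at most `max_steps` windows per stride, with windows shorter than two characters merged into the next one), so partial product names can still hit the vector index. An illustration of which windows get embedded (jieba only, no Milvus calls):

```python
import jieba

name = "彩色多普勒超声诊断仪"
name_cut = list(jieba.cut(name))
for stride in (1, 2):
    steps = len(name_cut) // stride
    buf, windows = "", []
    for i in range(min(steps, 5)):            # max_steps defaults to 5
        buf += "".join(name_cut[i * stride:(i + 1) * stride])
        if len(buf) < 2:                      # too short: merge into the next window
            continue
        windows.append(buf)
        buf = ""
    print(stride, windows)                    # each window is embedded and searched
```
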
@@ -39,8 +72,10 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
         except Exception as e:
             log("get redis data error")
         if _search_list is not None:
+            # log("_search_list is not None")
             return json.loads(_search_list)
         else:
+            # log("search from milvus")
             list_result = []
             result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
             for hits in result:
@@ -53,7 +88,14 @@ def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_
                 for k in output_fields:
                     _d[k] = _search.entity.get(k)
                 final_list.append(_d)
-            final_list = remove_repeat_item(final_list,k="standard_name")
+            final_list = remove_repeat_item(final_list,k="ots_name")
+            for _d in final_list:
+                # _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
+                standard_set = set(_d.get("standard_name",""))
+                name_set = set(name)
+                _d["length_dis"] = len(standard_set&name_set)/max(len(standard_set)+len(name_set),1)
+            final_list.sort(key=lambda x:x.get("length_dis",0),reverse=True)
+            final_list.sort(key=lambda x:x.get("level",1))
             try:
                 db.set(_md5,json.dumps(final_list))
                 db.expire(_md5,2*60)
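
Search hits are now deduplicated on `ots_name` instead of `standard_name` and re-ranked twice: first by a character-overlap score (shared characters over the summed set sizes, descending), then by dictionary `level` ascending. Since Python's sort is stable, `level` ends up as the primary key and the overlap score breaks ties within a level. The score in isolation:

```python
# The overlap score used above, shown standalone; higher means more
# characters shared between the candidate standard_name and the query name.
def length_dis(standard_name, name):
    s, n = set(standard_name), set(name)
    return len(s & n) / max(len(s) + len(n), 1)

print(length_dis("彩色多普勒超声诊断仪", "超声诊断仪"))  # 5 shared chars / 15 -> ~0.33
```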
@@ -85,7 +127,7 @@ def get_embedding_request(sentence,retry_times=3):
     db = redis.Redis(connection_pool=pool_product)
 
     try:
-        _md5 = getMD5(str(sentence))+"_embedding"
+        _md5 = getMD5(get_milvus_standard_name(sentence))+"_embedding"
         _embedding = None
         try:
             _embedding = db.get(_md5)
@@ -98,6 +140,7 @@ def get_embedding_request(sentence,retry_times=3):
             if _embedding is not None:
                 try:
                     db.set(_md5,json.dumps(_embedding))
+                    db.expire(_md5,60*60)
                 except Exception as e:
                     log("set redis data error")
             return _embedding
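
Embedding lookups are cached in Redis under an MD5 of the normalized sentence (`get_milvus_standard_name`, a project helper), and cache entries now expire after an hour. A generic cache-aside sketch of the same pattern, with `compute_embedding` as a hypothetical stand-in for the embedding service call:

```python
import hashlib, json

# Cache-aside sketch of get_embedding_request; `compute_embedding` is a
# hypothetical stand-in for the embedding HTTP call.
def cached_embedding(db, sentence, compute_embedding, ttl=60 * 60):
    key = hashlib.md5(sentence.encode("utf-8")).hexdigest() + "_embedding"
    hit = db.get(key)
    if hit is not None:
        return json.loads(hit)
    vec = compute_embedding(sentence)
    if vec is not None:
        db.set(key, json.dumps(vec))
        db.expire(key, ttl)  # the new one-hour TTL added above
    return vec
```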
@@ -181,12 +224,15 @@ def is_similar(source,target,_radio=None):
     #判断相似度
     similar = fuzz.ratio(source,target)
     if similar>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar))
         return True
     similar_jaro = Levenshtein.jaro(source,target)
     if similar_jaro*100>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
         return True
     similar_jarow = Levenshtein.jaro_winkler(source,target)
-    if similar_jarow*100>=90:
+    if similar_jarow*100>=min_ratio:
+        log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
         return True
 
     if min_len>=5:
@@ -196,13 +242,6 @@ def is_similar(source,target,_radio=None):
                 return True
         elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
             return True
-    # 全中文判断是否包含
-    if len(source)==max_len and judge_pur_chinese(target):
-        if str(source).find(target)>=0:
-            return True
-    if len(target)==max_len and judge_pur_chinese(source):
-        if target.find(source)>=0:
-            return True
     return False
 
 
@@ -214,18 +253,103 @@ def is_contain(source,target,min_len=2):
         return True
     return False
 
-def check_product(source,target):
-    if is_contain(source,target,min_len=3):
+def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9\-]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9-]+)")):
+    if re.search(chat_pattern,source) is not None or re.search(chat_pattern,target) is not None:
+        a = set(re.findall(find_pattern,source))
+        b = set(re.findall(find_pattern,target))
+        if len(a&b)>0:
+            return True
+        else:
+            return False
+
+def check_product(source,target,remove_words):
+
+    if remove_words is not None and remove_words!="":
+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+        list_split = [a.strip() for a in _split if a.strip()!=""]
+        for _s in list_split:
+            if str(source).find(_s)>=0:
+                return False
+
+    _check = check_char(source,target)
+    if _check:
+        return True
+    else:
+        if _check==False:
+            return False
+
+    if len(source)>len(target) and target in source:
+        return True
+
+    max_len = max(len(source),len(target))
+    min_len = min(len(source),len(target))
+
+    if min_len<2:
+        return False
+    elif max_len<=5:
+        min_ratio=96
+    else:
+        min_ratio = 95
+    min_ratio = 98
+    if is_similar(source,target,min_ratio):
         return True
     return False
 
 
-def check_brand(source,target):
-    source = str(source).lower()
-    target = str(target).lower()
+def check_brand(source,target,remove_words):
+
+
+    source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
+    target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
+
+    if remove_words is not None and remove_words!="":
+        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
+        list_split = [a.strip() for a in _split if a.strip()!=""]
+        for _s in _split:
+            if str(source).find(_s)>=0:
+                return False
 
-    if is_contain(source,target):
+
+    max_len = max(len(source),len(target))
+    min_len = min(len(source),len(target))
+
+
+    if min_len<2:
+        return False
+    elif max_len<=5:
+        min_ratio=94
+    else:
+        min_ratio = 90
+    min_ratio = 98
+
+    source_c = "".join(get_chinese_string(source))
+    target_c = "".join(get_chinese_string(target))
+
+    _check = check_char(source,target)
+    if _check:
         return True
+    else:
+        if _check==False:
+            return False
+
+    if len(source_c)>=2 and len(target_c)>=2:
+        if not(source_c in area_set or target_c in area_set):
+            if is_contain(source_c,target_c):
+                return True
+
+            if is_similar(source_c,target_c,min_ratio):
+                return True
+        else:
+            return False
+
+    if has_same_specs_count(source,target):
+
+        if is_contain(source,target):
+            return True
+        if is_similar(source,target,min_ratio):
+            return True
+
+
 
 SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
 NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
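
`check_char` is deliberately tri-state: `True` when at least one side is purely alphanumeric and the two share a token, `False` when the rule applies but no token overlaps, and `None` when neither side looks alphanumeric. That is why `check_product` and `check_brand` test `_check==False` explicitly rather than `not _check`, which would also trip on `None`. The behavior restated in isolation:

```python
import re

# Restatement of check_char's tri-state contract, slightly simplified.
def check_char(source, target,
               chat_pattern=re.compile("^[a-zA-Z0-9\\-]+$"),
               find_pattern=re.compile("[a-zA-Z0-9-]+")):
    # True/False when at least one side is purely alphanumeric; None otherwise.
    if chat_pattern.search(source) or chat_pattern.search(target):
        return bool(set(find_pattern.findall(source)) & set(find_pattern.findall(target)))

print(check_char("VolusonE10彩超", "VolusonE10"))  # True  (shared token)
print(check_char("E10", "X20"))                    # False (rule applies, no overlap)
print(check_char("彩超", "超声仪"))                # None  (rule does not apply)
```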
@@ -249,7 +373,7 @@ def has_same_specs_count(source, target):
                 dict_target[s] = 0
             dict_target[s] += 1
     union_keys = set(list(dict_source.keys())) & set(list(dict_target.keys()))
-    if len(dict_source.keys())!= len(union_keys):
+    if len(dict_source.keys())!= len(union_keys) or len(dict_target.keys())!= len(union_keys):
         return False
     for k,v in dict_source.items():
         if v!=dict_target.get(k):
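
The `has_same_specs_count` fix makes the key comparison symmetric: previously only the source's keys were checked against the intersection, so a target with extra characters (the "500ml" vs "3500ml" case noted in the `__main__` block below) still passed. An equivalent reformulation with `Counter`, assuming the same allowed character set:

```python
from collections import Counter

# SPECS_CHECK_SET as defined above (letters, digits, '.', Roman numerals).
SPECS_CHECK_SET = set("abcdefghijklmnopqrstuvwxyz0123456789.IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ")

def same_specs_counts(source, target):
    cs = Counter(c for c in source if c in SPECS_CHECK_SET)
    ct = Counter(c for c in target if c in SPECS_CHECK_SET)
    return cs == ct   # equal key sets *and* counts, on both sides

print(same_specs_counts("500ml", "3500ml"))  # False; the one-sided key check passed this
```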
@@ -270,7 +394,7 @@ def is_legal_brand(ots_client,brand):
         TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,"delete")
     ])
 
-    rows,next_token,total_count,is_all_succeed = ots_client.search("document_product_dict_interface","document_product_dict_interface_index",
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                         SearchQuery(bool_query,get_total_count=True))
     if total_count>0:
         return False
@@ -284,7 +408,7 @@ def is_legal_brand(ots_client,brand):
         ])
 
     ])
-    rows,next_token,total_count,is_all_succeed = ots_client.search("document_product_dict","document_product_dict_index",
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                         SearchQuery(bool_query,get_total_count=True))
     if total_count>0:
         return False
@@ -312,7 +436,7 @@ def is_legal_brand(ots_client,brand):
     bool_query = BoolQuery(must_queries=[
         TermQuery(DOCUMENT_PRODUCT_TMP_BRAND,brand)
     ])
-    rows,next_token,total_count,is_all_succeed = ots_client.search("document_product_temp","document_product_temp_index",
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
                                                                         SearchQuery(bool_query,get_total_count=True))
 
     if total_count>=5:
@@ -369,6 +493,7 @@ import requests
 session = requests.Session()
 def request_embedding(sentence,retry_times=3):
     for _ in range(retry_times):
+        sentence = get_milvus_standard_name(sentence)
         resp = session.post(embedding_url,json={"sentence":sentence})
         if resp.status_code==200:
             content = resp.content.decode("utf-8")
@@ -394,17 +519,17 @@ def clean_product_brand(product_brand):
     _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",product_brand)
     if _search is not None:
         product_brand = _search.groupdict().get("brand")
-    brand = re.sub("[/\\,,、.|等]|一批|/无|品牌",'',product_brand)
+    brand = re.sub("[/\\,,、.|等]|一批|/无|品牌|^[/.]+",'',product_brand)
     return brand
 
 
-def clean_product_specs(product_specs):
+def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")):
     '''
     clean before insert
     :param product_specs:
     :return:
     '''
-    _specs = re.sub(SPECS_PATTERN,'',product_specs)
+    _specs = re.sub(_PATTERN,'',product_specs)
     if len(_specs)>0:
         return _specs
     return product_specs
@@ -441,10 +566,12 @@ def clean_product_quantity(product_quantity):
     return ""
 
 if __name__ == '__main__':
-    print(is_similar('128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10','VolusonE10'))
-    print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
-    import Levenshtein
-    print(Levenshtein.ratio('助听器','助行器'))
-    a = "无锡贝尔森品牌"
-    print(clean_product_brand(a))
-    print(is_legal_brand(getConnect_ots(),"液晶显示"))
+    # print(check_brand('DYW-JY-T01-A1(定制)','JY',''))
+    # print(check_product("医用冷藏箱","医用","a|"))
+
+    # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
+    # import Levenshtein
+    # print(Levenshtein.ratio('助听器','助行器'))
+    # print(clean_product_specs("//4008SverssionV10"))
+    print(is_legal_brand(getConnect_ots(),"保健"))
+    # print(check_specs("500ml","3500ml"))

File diff suppressed because it is too large
+ 603 - 200
BaseDataMaintenance/maintenance/product/product_dict.py


+ 7 - 3
BaseDataMaintenance/maintenance/product/product_setting.py

@@ -11,9 +11,13 @@ PRODUCT_TMEP_STATUS_TO_SYNC = [201,300]
 PRODUCT_TEMP_STATUS_TO_NO_SYNC = [401,450]
 PRODUCT_TEMP_STATUS_TO_REPEATED = [451,500]
 
-COLLECTION_NAME_NAME = "product_dict_embedding_name"
-COLLECTION_NAME_BRAND = "product_dict_embedding_brand"
-COLLECTION_NAME_SPECS = "product_dict_embedding_specs"
+# COLLECTION_NAME_NAME = "product_dict_embedding_name"
+# COLLECTION_NAME_BRAND = "product_dict_embedding_brand"
+# COLLECTION_NAME_SPECS = "product_dict_embedding_specs"
+
+COLLECTION_NAME_NAME = "product_dict_embedding_name_single"
+COLLECTION_NAME_BRAND = "product_dict_embedding_brand_single"
+COLLECTION_NAME_SPECS = "product_dict_embedding_specs_single"
 
 NAME_GRADE = 3
 BRAND_GRADE = 4

+ 740 - 256
BaseDataMaintenance/maintenance/product/products.py

@@ -13,6 +13,7 @@ from BaseDataMaintenance.model.ots.document_product_dict_interface import *
 from BaseDataMaintenance.model.ots.document import *
 from BaseDataMaintenance.model.ots.attachment import *
 from BaseDataMaintenance.model.ots.enterprise import *
+from BaseDataMaintenance.model.ots.project import *
 
 from tablestore import *
 
@@ -24,18 +25,19 @@ from BaseDataMaintenance.maintenance.product.product_dict import Product_Dict_Ma
 from apscheduler.schedulers.blocking import BlockingScheduler
 
 from BaseDataMaintenance.maintenance.product.make_brand_pattern import *
-from BaseDataMaintenance.maintenance.product.product_dict import IS_SYNCHONIZED
+from BaseDataMaintenance.maintenance.product.product_dict import *
 import logging
 
 root = logging.getLogger()
 root.setLevel(logging.INFO)
 from uuid import uuid4
+from multiprocessing import Queue as PQueue
 
 class Product_Manager(Product_Dict_Manager):
 
     def __init__(self):
         super(Product_Manager, self).__init__()
-        self.process_queue = Queue()
+        self.process_queue = PQueue()
         self.ots_client = getConnect_ots()
 
         self.set_id = set()
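
`process_queue` switches from the threading-oriented `Queue` to `multiprocessing.Queue` (imported as `PQueue`): a plain `queue.Queue` lives in one process's memory and is invisible to child processes spawned for consumption. A minimal sentinel-terminated sketch of the difference:

```python
# Minimal sketch: multiprocessing.Queue is shared with child processes,
# queue.Queue is not.
from multiprocessing import Process, Queue as PQueue

def worker(q):
    while True:
        item = q.get()
        if item is None:          # sentinel: stop
            break
        print("processing", item)

if __name__ == "__main__":
    q = PQueue()
    for item in ("a", "b", "c"):
        q.put(item)
    q.put(None)
    p = Process(target=worker, args=(q,))
    p.start()
    p.join()
```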
@@ -63,11 +65,12 @@ class Product_Manager(Product_Dict_Manager):
             return
         bool_query = BoolQuery(must_queries=[RangeQuery(DOCUMENT_PRODUCT_TMP_STATUS,1,51)])
 
-        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
                                                                             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
                                                                             columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
         list_data = getRow_ots(rows)
         _count = len(list_data)
+        log("producer %d/%d"%(q_size,total_count))
         list_id = []
         for _d in list_data:
             _id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
@@ -76,7 +79,7 @@ class Product_Manager(Product_Dict_Manager):
             list_id.append(_id)
             self.process_queue.put(_d)
         while next_token:
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
                                                                                 SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                 columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
             list_data = getRow_ots(rows)
@@ -109,11 +112,14 @@ class Product_Manager(Product_Dict_Manager):
 
 
     def comsumer_handle(self,item,result_queue):
-        self.standardize(item)
+        try:
+            self.standardize(item)
+        except Exception as e:
+            traceback.print_exc()
 
 
 
-    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
+    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]):
         '''
         Standardizes the product data
         通过匹配标准参数表进行标准化,匹配是非精确匹配,校验规则是?
@@ -135,13 +141,24 @@ class Product_Manager(Product_Dict_Manager):
 
         document_product_tmp = Document_product_tmp(tmp_dict)
 
+        tenderee = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_TENDEREE,"")
+
         name = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,"")
         brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
         parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
 
+        name = name.replace(tenderee,"")
+        brand = brand.replace(tenderee,"")
+
+        original_name = name
+        original_brand = brand
+        original_specs = specs
+
         list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
 
+        list_candidate_brand_specs = [a for a in [brand,specs,parameters,name] if a!=""]
+
         if brand=="" and parameters!="":
             brand = parameters
         if specs=="" and parameters!="":
@@ -156,20 +173,47 @@ class Product_Manager(Product_Dict_Manager):
         brand_ots_id = None
         specs_ots_id = None
         if name is not None and name!="":
-            name_vector = get_embedding_request(name)
-            if name_vector is not None:
+            Coll,_ = self.get_collection(NAME_GRADE)
+
+            search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
+
+            for _search in search_list:
+                ots_id = _search.get("standard_name_id")
+                ots_name = _search.get("ots_name")
+                standard_name = _search.get("standard_name")
+                ots_parent_id = _search.get("ots_parent_id")
+                remove_words = _search.get("remove_words")
+
+                if check_product(name,ots_name,remove_words):
+                    name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
+                    original_name = name
+                    new_name = standard_name
+
+                    log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
+                    # #update alias of name
+                    # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
+                    # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                    # if _flag and _dpd.updateAlias(name):
+                    #     _dpd.update_row(self.ots_client)
+                    break
+        if name_ots_id is None:
+            for name in list_candidates:
                 Coll,_ = self.get_collection(NAME_GRADE)
-
-                search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=60)
+                search_list = get_intellect_search(Coll,embedding_index_name,name,NAME_GRADE,self.search_params,output_fields,limit=10)
 
                 for _search in search_list:
                     ots_id = _search.get("standard_name_id")
-                    ots_name = _search.get("standard_name")
+                    ots_name = _search.get("ots_name")
+                    standard_name = _search.get("standard_name")
                     ots_parent_id = _search.get("ots_parent_id")
+                    remove_words = _search.get("remove_words")
+
+                    if check_product(name,ots_name,remove_words):
 
-                    if is_similar(name,ots_name) or check_product(name,ots_name):
-                        name_ots_id = ots_id
-                        new_name = ots_name
+                        log("checking name %s succeed %s %s"%(name,ots_name,str(remove_words)))
+                        name_ots_id = get_document_product_dict_id(ots_parent_id,standard_name)
+                        original_name = name
+                        new_name = standard_name
 
                         # #update alias of name
                         # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
@@ -177,110 +221,104 @@ class Product_Manager(Product_Dict_Manager):
                         # if _flag and _dpd.updateAlias(name):
                         #     _dpd.update_row(self.ots_client)
                         break
-        if name_ots_id is None:
-            for name in list_candidates:
-                name_vector = get_embedding_request(name)
-                if name_vector is not None:
-                    Coll,_ = self.get_collection(NAME_GRADE)
-                    search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=20)
-
-                    for _search in search_list:
-                        ots_id = _search.get("standard_name_id")
-                        ots_name = _search.get("standard_name")
-                        ots_parent_id = _search.get("ots_parent_id")
-
-                        if is_similar(name,ots_name) or check_product(name,ots_name):
-                            name_ots_id = ots_id
-                            new_name = ots_name
-
-                            # #update alias of name
-                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
-                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                            # if _flag and _dpd.updateAlias(name):
-                            #     _dpd.update_row(self.ots_client)
-                            break
         if name_ots_id is not None:
 
             if brand is not None and brand!="":
 
                 s_brand = brand
                 l_brand = [brand]
-                l_brand.append(clean_product_brand(s_brand))
-                brand_ch = get_chinese_string(brand)
-                l_brand.extend(brand_ch)
+
+                Coll,_ = self.get_collection(BRAND_GRADE)
 
                 _find = False
                 for brand in l_brand:
 
-                    brand_vector = get_embedding_request(brand)
-                    if brand_vector is not None:
-                        Coll,_ = self.get_collection(BRAND_GRADE)
-                        search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=60)
+                    if len(brand)>100:
+                        continue
+                    search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
 
-                        # log("search brand %s"%(brand))
-                        for _search in search_list:
+                    # log("search brand %s"%(brand))
+                    for _search in search_list:
 
-                            ots_id = _search.get("standard_name_id")
-                            ots_name = _search.get("standard_name")
-                            ots_parent_id = _search.get("ots_parent_id")
+                        ots_id = _search.get("standard_name_id")
+                        ots_name = _search.get("ots_name")
+                        standard_name = _search.get("standard_name")
+                        ots_parent_id = _search.get("ots_parent_id")
+                        remove_words = _search.get("remove_words")
 
-                            # log("check brand %s and %s"%(brand,ots_name))
-                            if is_similar(brand,ots_name) or check_brand(brand,ots_name):
+                        # log("check brand %s and %s"%(brand,ots_name))
+                        if check_brand(brand,ots_name,remove_words):
 
-                                # log("check brand similar succeed:%s and %s"%(brand,ots_name))
+                            # log("check brand similar succeed:%s and %s"%(brand,ots_name))
 
-                                if ots_name==new_name:
+                            if ots_name==new_name:
+                                continue
+                            original_brand = brand
+                            if original_brand==original_name:
+                                if original_brand.find(ots_name)>=1:
                                     continue
-                                new_brand = ots_name
-
-                                log("checking brand %s succeed %s"%(brand,new_brand))
-                                # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
-
-                                if name_ots_id is not None:
-                                    brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
-
-                                    _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
-                                                DOCUMENT_PRODUCT_DICT_NAME:new_brand,
-                                                DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
-                                                DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
-                                                DOCUMENT_PRODUCT_DICT_STATUS:1,
-                                                DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
-                                                DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
-                                                DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                }
-                                    _dpd_brand = Document_product_dict(_d_brand)
-                                    # _dpd_brand.updateAlias(str(new_brand).lower())
-                                    if not _dpd_brand.exists_row(self.ots_client):
-                                        _dpd_brand.update_row(self.ots_client)
-
-                                    else:
-                                        pass
-                                        # #update alias
-                                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
-                                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                                        # if _flag:
-                                        #     if _dpd.updateAlias(brand):
-                                        #         _dpd.update_row(self.ots_client)
+                                if len(original_brand)<=3:
+                                    continue
+                            new_brand = standard_name
+
+                            log("checking brand %s succeed %s"%(brand,new_brand))
+                            # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
+
+                            if name_ots_id is not None:
+                                brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
+
+                                _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
+                                            DOCUMENT_PRODUCT_DICT_NAME:new_brand,
+                                            DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
+                                            DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
+                                            DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                            DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
+                                            DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                                            DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                            DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                            }
+                                _dpd_brand = Document_product_dict(_d_brand)
+                                # _dpd_brand.updateAlias(str(new_brand).lower())
+                                if not _dpd_brand.exists_row(self.ots_client):
+                                    _dpd_brand.update_row(self.ots_client)
 
-                                _find = True
-                                break
-                            else:
-                                # log("check brand similar failed:%s and %s"%(brand,ots_name))
-                                # add new brand?
-                                pass
-                        if _find:
+                                else:
+                                    pass
+                                    # #update alias
+                                    # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
+                                    # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                    # if _flag:
+                                    #     if _dpd.updateAlias(brand):
+                                    #         _dpd.update_row(self.ots_client)
+
+                            _find = True
                             break
+                        else:
+                            # log("check brand similar failed:%s and %s"%(brand,ots_name))
+                            # add new brand?
+                            pass
+                    if _find:
+                        break
                 if not _find:
                     for brand in l_brand:
+                        if len(brand)>100:
+                            continue
                         if self.check_new_brand(brand):
                             new_brand = clean_product_brand(brand)
                             if new_brand=="":
                                 continue
+                            original_brand = brand
+                            if original_brand==original_name:
+                                if new_name==original_brand:
+                                    continue
+                                if original_brand.find(new_brand)>=1:
+                                    continue
+                                if len(original_brand)<=3:
+                                    continue
                             log("adding new brand %s"%(str(new_brand)))
                             _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
-                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
+                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(new_brand).lower()),
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
                                         DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
@@ -294,74 +332,74 @@ class Product_Manager(Product_Dict_Manager):
 
             if brand_ots_id is None:
                 _find = False
+                Coll,_ = self.get_collection(BRAND_GRADE)
                 for brand in list_candidates:
                     if _find:
                         break
                     l_brand = [brand]
-                    l_brand.append(clean_product_brand(brand))
-                    brand_ch = get_chinese_string(brand)
-                    l_brand.extend(brand_ch)
 
                     for brand in l_brand:
+                        if len(brand)>100:
+                            continue
                         if _find:
                             break
-                        start_time = time.time()
-                        # brand_vector = request_embedding(brand)
-                        brand_vector = get_embedding_request(brand)
-                        debug("get embedding for brand %s takes %.4fs"%(brand,time.time()-start_time))
-                        if brand_vector is not None:
-                            Coll,_ = self.get_collection(BRAND_GRADE)
-                            start_time = time.time()
-                            # search_list = search_embedding(Coll,embedding_index_name,[brand_vector],self.search_params,output_fields,limit=10)
-                            search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=10)
-                            debug("get search_list for brand %s takes %.4fs"%(brand,time.time()-start_time))
-                            # log("search brand %s"%(brand))
-                            for _search in search_list:
-
-
-                                ots_id = _search.get("standard_name_id")
-                                ots_name = _search.get("standard_name")
-                                ots_parent_id = _search.get("ots_parent_id")
-
-                                # log("check brand %s and %s"%(brand,ots_name))
-                                if is_similar(brand,ots_name,_radio=95) or check_brand(brand,ots_name):
-                                    # log("check brand similar succeed:%s and %s"%(brand,ots_name))
-                                    if ots_name==new_name:
+
+                        search_list = get_intellect_search(Coll,embedding_index_name,brand,BRAND_GRADE,self.search_params,output_fields,limit=10)
+                        # log("search brand %s"%(brand))
+                        for _search in search_list:
+
+                            ots_id = _search.get("standard_name_id")
+                            ots_name = _search.get("ots_name")
+                            standard_name = _search.get("standard_name")
+                            ots_parent_id = _search.get("ots_parent_id")
+                            remove_words = _search.get("remove_words")
+
+                            # log("check brand %s and %s"%(brand,ots_name))
+                            if check_brand(brand,ots_name,remove_words):
+                                # log("check brand similar succeed:%s and %s"%(brand,ots_name))
+                                if ots_name==new_name:
+                                    continue
+
+                                original_brand = brand
+                                if original_brand==original_name:
+                                    if original_brand.find(ots_name)>=1:
                                         continue
-                                    new_brand = ots_name
+                                    if len(original_brand)<=3:
+                                        continue
+                                new_brand = standard_name
 
-                                    log("checking brand %s succeed %s"%(brand,new_brand))
-                                    # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
+                                log("checking brand %s succeed %s"%(brand,new_brand))
+                                # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
 
-                                    if name_ots_id is not None:
-                                        brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
+                                if name_ots_id is not None:
+                                    brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
 
-                                        _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
-                                                    DOCUMENT_PRODUCT_DICT_NAME:new_brand,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
-                                                    DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
-                                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
-                                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
-                                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
-                                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                                                    }
-                                        _dpd_brand = Document_product_dict(_d_brand)
-                                        # _dpd_brand.updateAlias(str(new_brand).lower())
-                                        if not _dpd_brand.exists_row(self.ots_client):
-                                            _dpd_brand.update_row(self.ots_client)
+                                    _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
+                                                DOCUMENT_PRODUCT_DICT_NAME:new_brand,
+                                                DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_brand).lower()),
+                                                DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
+                                                DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                                DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
+                                                DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                                                DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                                DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                                }
+                                    _dpd_brand = Document_product_dict(_d_brand)
+                                    # _dpd_brand.updateAlias(str(new_brand).lower())
+                                    if not _dpd_brand.exists_row(self.ots_client):
+                                        _dpd_brand.update_row(self.ots_client)
 
-                                        else:
-                                            pass
-                                            # #update alias
-                                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
-                                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
-                                            # if _flag:
-                                            #     if _dpd.updateAlias(brand):
-                                            #         _dpd.update_row(self.ots_client)
+                                    else:
+                                        pass
+                                        # #update alias
+                                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
+                                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                        # if _flag:
+                                        #     if _dpd.updateAlias(brand):
+                                        #         _dpd.update_row(self.ots_client)
 
-                                    _find = True
-                                    break
+                                _find = True
+                                break
 
             if specs is not None and specs!="":
 
@@ -374,9 +412,9 @@ class Product_Manager(Product_Dict_Manager):
                 for s in re.split("[\u4e00-\u9fff]",specs):
                     if s!="" and len(s)>4:
                         list_specs.append(s)
-                similar_flag = None
                 _index = 0
                 break_flag = False
+                list_similar_specs = []
                 for c_specs in list_specs:
                     if break_flag:
                         break
@@ -385,12 +423,13 @@ class Product_Manager(Product_Dict_Manager):
 
                     if specs_vector is not None:
                         Coll,_ = self.get_collection(SPECS_GRADE)
-                        search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=60)
+                        search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=20)
 
                         for _search in search_list:
 
                             ots_id = _search.get("standard_name_id")
-                            ots_name = _search.get("standard_name")
+                            ots_name = _search.get("ots_name")
+                            standard_name = _search.get("standard_name")
                             ots_parent_id = _search.get("ots_parent_id")
 
                             debug("checking specs %s and %s"%(specs,ots_name))
@@ -398,7 +437,10 @@ class Product_Manager(Product_Dict_Manager):
                                 # log("specs is_similar")
                                 if check_specs(c_specs,ots_name):
                                     break_flag = True
-                                    new_specs = ots_name
+                                    original_specs = c_specs
+                                    if standard_name==new_name:
+                                        continue
+                                    new_specs = standard_name
                                     log("check_specs %s succeed %s"%(specs,new_specs))
 
                                     # to update the document_product_dict which is builded for search
@@ -408,7 +450,7 @@ class Product_Manager(Product_Dict_Manager):
 
                                         _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                     DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
+                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
                                                     DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                     DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                     DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
@@ -429,51 +471,60 @@ class Product_Manager(Product_Dict_Manager):
                                             # if _flag:
                                             #     if _dpd.updateAlias(specs):
                                             #         _dpd.update_row(self.ots_client)
+                                    break_flag = True
                                     break
-                                else:
-                                    if _index == 1:
-                                        similar_flag = True
-
+                            else:
+                                list_similar_specs.append(specs)
                 # add new specs?
-                debug("specs not similar")
-                if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
-                    debug("is_legal_specs")
-                    new_specs = clean_product_specs(specs)
-                    # insert into document_product_dict a new record
-                    # to update the document_product_dict which is builded for search
-                    # add new specs
-                    if brand_ots_id is not None and name_ots_id is not None:
-                        _md5 = get_document_product_dict_id(brand_ots_id,new_specs)
-
-                        # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
-                        #       DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                        #       DOCUMENT_PRODUCT_DICT_ALIAS:"%s&&%s"%(specs,new_specs),
-                        #       DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
-                        #       DOCUMENT_PRODUCT_DICT_STATUS:1,
-                        #       DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
-                        #       DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                        #       DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                        #       }
-                        # _dpd = Document_product_dict(_d)
-                        # _dpd.update_row(self.ots_client)
-
-                        log("adding new specs %s"%(new_specs))
-                        # user interface to add
-                        _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_specs,
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(new_specs.lower()),
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:SPECS_GRADE,
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:brand_ots_id,
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-                              DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
-                              }
-                        _dpdi = Document_product_dict_interface(_d)
-                        _dpdi.update_row(self.ots_client)
+                if new_specs is not None and new_specs!="":
+                    pass
+                else:
+                    debug("specs not similar")
+                    for specs in list_similar_specs:
+                        if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
+                            debug("is_legal_specs")
+                            original_specs = specs
+
+                            new_specs = clean_product_specs(specs)
+                            if new_specs==new_name:
+                                new_specs = ""
+                                continue
+                            # insert a new record into document_product_dict
+                            # to update the document_product_dict that is built for search
+                            # add new specs
+                            if brand_ots_id is not None and name_ots_id is not None:
+                                specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
+
+                                # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
+                                #       DOCUMENT_PRODUCT_DICT_NAME:new_specs,
+                                #       DOCUMENT_PRODUCT_DICT_ALIAS:"%s&&%s"%(specs,new_specs),
+                                #       DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
+                                #       DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                #       DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
+                                #       DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                #       DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                #       }
+                                # _dpd = Document_product_dict(_d)
+                                # _dpd.update_row(self.ots_client)
+
+                                log("adding new specs %s"%(new_specs))
+                                # user interface to add
+                                _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_specs,
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(new_specs.lower()),
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:SPECS_GRADE,
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:brand_ots_id,
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                      DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
+                                      }
+                                _dpdi = Document_product_dict_interface(_d)
+                                _dpdi.update_row(self.ots_client)
+                            break
         if specs_ots_id is None:
             _find = False
-            for specs in list_candidates:
+            for specs in list_candidate_brand_specs:
                 if _find:
                     break
 
@@ -495,29 +546,34 @@ class Product_Manager(Product_Dict_Manager):
 
                     if specs_vector is not None:
                         Coll,_ = self.get_collection(SPECS_GRADE)
-                        search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=20)
+                        search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=10)
 
                         for _search in search_list:
                             if _find:
                                 break
 
                             ots_id = _search.get("standard_name_id")
-                            ots_name = _search.get("standard_name")
+                            ots_name = _search.get("ots_name")
+                            standard_name = _search.get("standard_name")
                             ots_parent_id = _search.get("ots_parent_id")
 
                             debug("checking specs %s and %s"%(specs,ots_name))
-                            if is_similar(specs,ots_name):
+                            if is_similar(c_specs,ots_name):
                                 # log("specs is_similar")
                                 if check_specs(c_specs,ots_name):
                                     break_flag = True
-                                    new_specs = ots_name
+                                    original_specs = c_specs
+                                    new_specs = standard_name
+                                    if new_specs==new_name:
+                                        new_specs = ""
+                                        continue
                                     if brand_ots_id is not None:
                                         # check whether a specs record whose parent_id is brand_ots_id exists; insert one if not, otherwise update its alias
                                         specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
 
                                         _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                     DOCUMENT_PRODUCT_DICT_NAME:new_specs,
-                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
+                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(new_specs).lower()),
                                                     DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                     DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                     DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
@@ -557,11 +613,12 @@ class Product_Manager(Product_Dict_Manager):
                 if unit_price>0:
                     new_quantity = total_price/unit_price
                     if new_quantity!=quantity:
-                        if new_quantity==total_price//unit_price:
-                            quantity = int(new_quantity)
-                            _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
-                        else:
-                            is_legal_data = False
+                        # if new_quantity==total_price//unit_price:
+                        #     quantity = int(new_quantity)
+                        #     _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
+                        # else:
+                        #     is_legal_data = False
+                        is_legal_data = False
                 elif quantity>0:
                     unit_price = total_price/quantity
                     _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
@@ -610,9 +667,9 @@ class Product_Manager(Product_Dict_Manager):
 
             _product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
 
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
-            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,original_name,True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,original_brand,True)
+            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,original_specs,True)
 
             bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
             if bid_filemd5s is not None:
@@ -621,14 +678,16 @@ class Product_Manager(Product_Dict_Manager):
             if not is_legal_data:
                 _status = randint(501,550)
 
-            elif self.dumplicate(_product):
-                _status = randint(201,300)
-                save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
-
-                _product.update_row(self.ots_client)
-
             else:
-                _status = randint(451,500)
+                _flag,dump_id = self.dumplicate(_product)
+                if _flag:
+                    _status = randint(201,300)
+                    save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
+
+                    _product.update_row(self.ots_client)
+                else:
+                    _status = randint(451,500)
+                    save_product_tmp.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
 
         else:
             _status = randint(401,450)
@@ -692,11 +751,11 @@ class Product_Manager(Product_Dict_Manager):
     def get_value_count(self,name,brand,specs,unit_price,quantity):
 
         value_count = 0
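+        # one point for each non-empty field among name/brand/specs/unit_price/quantity; the richer record wins during deduplication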
-        if len(name)>0:
+        if name is not None and len(name)>0:
             value_count += 1
-        if len(brand)>0:
+        if brand is not None and len(brand)>0:
             value_count += 1
-        if len(specs)>0:
+        if specs is not None and len(specs)>0:
             value_count += 1
         if isinstance(unit_price,(float,int)) and unit_price>0:
             value_count += 1
@@ -716,7 +775,8 @@ class Product_Manager(Product_Dict_Manager):
         tenderee = str(document_product.getProperties().get(DOCUMENT_PRODUCT_TENDEREE,""))
         supplier = str(document_product.getProperties().get(DOCUMENT_PRODUCT_SUPPLIER,""))
 
-
+        base_value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
+        list_dump_id = []
         page_time_before = page_time
         page_time_after = page_time
         try:
@@ -725,6 +785,8 @@ class Product_Manager(Product_Dict_Manager):
         except Exception as e:
             pass
 
+        to_save = 1
+
         if len(name)>0 and len(brand)>0 and len(specs)>0 and isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
             bool_query = BoolQuery(must_queries=[TermQuery("name",name),
                                                  RangeQuery("page_time",page_time_before,page_time_after,True,True),
@@ -734,12 +796,69 @@ class Product_Manager(Product_Dict_Manager):
                                                  TermQuery("quantity",quantity)
                                                  ])
 
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                                 SearchQuery(bool_query,limit=1),
                                                                                 columns_to_get=ColumnsToGet(["name",'brand','specs'],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
             if len(list_data)>0:
-                return list_data[0].get(DOCUMENT_PRODUCT_ID),1
+                return list_data[0].get(DOCUMENT_PRODUCT_ID),0
+
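+        # look up the project(s) containing this docid so duplicates attached to sibling documents of the same project can be found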
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(project_docids,str(docid)),
+        ])
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
+                                                                            SearchQuery(bool_query,limit=10),
+                                                                            ColumnsToGet([project_docids],return_type=ColumnReturnType.SPECIFIED))
+
+        list_data = getRow_ots(rows)
+        set_docid = set()
+        for _data in list_data:
+            _docids = _data.get(project_docids,"")
+            for d_id in _docids.split(","):
+                d_id = d_id.strip()
+                if d_id!="":
+                    set_docid.add(int(d_id))
+        if docid in set_docid:
+            set_docid.remove(docid)
+        should_q = [TermQuery(DOCUMENT_PRODUCT_DOCID,did) for did in set_docid]
+        if len(should_q)>0:
+            bool_query = BoolQuery(must_queries=[TermQuery("name",name),
+                                                 BoolQuery(should_queries=should_q),
+                                                 ])
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                                SearchQuery(bool_query,limit=50),
+                                                                                columns_to_get=ColumnsToGet(["docid",'name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            dict_docid_name = {}
+            match_ids = []
+            for _data in list_data:
+                docid1 = _data.get(DOCUMENT_PRODUCT_DOCID)
+                name1 = _data.get(DOCUMENT_PRODUCT_NAME)
+                brand1 = _data.get(DOCUMENT_PRODUCT_BRAND)
+                specs1 = _data.get(DOCUMENT_PRODUCT_SPECS)
+                unit_price1 = _data.get(DOCUMENT_PRODUCT_UNIT_PRICE)
+                quantity1 = _data.get(DOCUMENT_PRODUCT_QUANTITY)
+                id = _data.get(DOCUMENT_PRODUCT_ID)
+                value_count1 = self.get_value_count(name1,brand1,specs1,unit_price1,quantity1)
+                if name1==name:
+                    match_ids.append({DOCUMENT_PRODUCT_ID:id,"value_count":value_count1})
+                    if docid1 not in dict_docid_name:
+                        dict_docid_name[docid1] = []
+                    dict_docid_name[docid1].append(name)
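+            # only treat rows as duplicates when every matched document carries exactly one product with this name; several same-name rows per document would make the pairing ambiguous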
+            is_all_one = True
+            for k,v in dict_docid_name.items():
+                if len(v)!=1:
+                    is_all_one = False
+            if is_all_one:
+                match_ids.sort(key=lambda x:x.get("value_count",0),reverse=True)
+                if len(match_ids)>0:
+                    _id = match_ids[0].get(DOCUMENT_PRODUCT_ID)
+                    value_count1 = match_ids[0]["value_count"]
+                    if base_value_count<value_count1:
+                        to_save = 0
+                    for _match in match_ids:
+                        list_dump_id.append(_match.get(DOCUMENT_PRODUCT_ID))
+
 
         if len(name)>0 and len(brand)>0 and len(supplier)>0 and len(tenderee)>0:
             # log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
@@ -750,11 +869,11 @@ class Product_Manager(Product_Dict_Manager):
                                                  TermQuery(DOCUMENT_PRODUCT_SUPPLIER,supplier),
                                                  ])
 
-            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
                                                                                 SearchQuery(bool_query,limit=50),
                                                                                 columns_to_get=ColumnsToGet(['name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
             list_data = getRow_ots(rows)
-            value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
+
 
             for _d in list_data:
                 s_id = _d.get(DOCUMENT_PRODUCT_ID)
@@ -773,12 +892,10 @@ class Product_Manager(Product_Dict_Manager):
                     check_flag = False
 
                 if check_flag:
-                    if value_count<value_count1:
+                    if base_value_count<value_count1:
                         to_save = 0
-                    else:
-                        to_save = 1
-                    return s_id,to_save
-        return None,1
+                    list_dump_id.append(s_id)
+        return list_dump_id,to_save
 
 
     def dumplicate(self,document_product):
@@ -791,18 +908,27 @@ class Product_Manager(Product_Dict_Manager):
         dump_id,to_save = self.dumplicate_search_product(document_product)
 
         if dump_id is not None:
-            document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,dump_id,True)
+            document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,str(dump_id),True)
 
         if to_save==1:
             if dump_id is not None:
-                _d = {DOCUMENT_PRODUCT_ID:dump_id,
-                      DOCUMENT_PRODUCT_STATUS:randint(401,450),
-                      DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
-                _dp = Document_product(_d)
-                _dp.update_row(self.ots_client)
-            return True
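+                # dump_id is either a single id (str, exact-match path) or a list of ids (project-level path)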
+                if isinstance(dump_id,str):
+                    _d = {DOCUMENT_PRODUCT_ID:dump_id,
+                          DOCUMENT_PRODUCT_STATUS:randint(401,450),
+                          DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
+                    _dp = Document_product(_d)
+                    _dp.update_row(self.ots_client)
+                elif isinstance(dump_id,list):
+                    for d_id in dump_id:
+                        _d = {DOCUMENT_PRODUCT_ID:d_id,
+                              DOCUMENT_PRODUCT_STATUS:randint(401,450),
+                              DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
+                        _dp = Document_product(_d)
+                        _dp.update_row(self.ots_client)
+
+            return True,dump_id
         else:
-            return False
+            return False,dump_id
 
     def start_processing(self):
         scheduler = BlockingScheduler()
@@ -848,11 +974,23 @@ def fix_product_data():
     '''
     table_name = "document_product_temp"
     table_index = "document_product_temp_index"
-    columns = [DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE]
+    columns = [DOCUMENT_PRODUCT_TMP_NEW_ID,DOCUMENT_PRODUCT_TMP_STATUS]
+
+
+    table_name = Document_product_table_name
+    table_index = Document_product_table_name+"_index"
+    columns = [DOCUMENT_PRODUCT_ORIGINAL_ID]
+
+
     ots_client = getConnect_ots()
-    bool_query = BoolQuery(must_queries=[
-        RangeQuery("status",501),
+    bool_query = BoolQuery(should_queries=[
+        # RangeQuery("status",501),
         # TermQuery("docid",246032980)
+
+        RangeQuery("status",401,501),
+        # RangeQuery("status",401,451)
+        # WildcardQuery(DOCUMENT_PRODUCT_ORIGINAL_SPECS,"MFUSOne")
+        # TermQuery(DOCUMENT_PRODUCT_SPECS,"MFUSOne")
     ])
 
     rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
@@ -860,6 +998,7 @@ def fix_product_data():
                                                                    columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
 
     list_rows = getRow_ots(rows)
+    print(total_count)
     while next_token:
         rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
                                                                        SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
@@ -876,6 +1015,8 @@ def fix_product_data():
     def fix_missing_data(item,result_queue):
 
         original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+
+        print("original_id",original_id)
         _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
         dpt = Document_product_tmp(_d)
         dpt.fix_columns(ots_client,["name","brand","specs"],True)
@@ -905,15 +1046,23 @@ def fix_product_data():
 
     def deleteAndReprocess(item,result_queue):
 
+        original_id = item.get(DOCUMENT_PRODUCT_TMP_ID)
+        new_id = item.get(DOCUMENT_PRODUCT_TMP_NEW_ID)
+
         original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
+        new_id = item.get(DOCUMENT_PRODUCT_ID)
+
+        print("original_id",original_id,"id",item.get(DOCUMENT_PRODUCT_ID))
         # delete data and rerun
         _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
         dpt = Document_product_tmp(_d)
         dpt.update_row(ots_client)
 
-        _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
-        dp = Document_product(_d)
-        dp.delete_row(ots_client)
+
+        if new_id is not None and new_id!="":
+            _d = {DOCUMENT_PRODUCT_ID:new_id}
+            dp = Document_product(_d)
+            dp.delete_row(ots_client)
 
     def handle(item,result_queue):
         win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
@@ -922,8 +1071,7 @@ def fix_product_data():
             dpt.setValue(DOCUMENT_PRODUCT_TMP_STATUS,1,True)
             dpt.update_row(ots_client)
 
-
-    mt = MultiThreadHandler(task_queue,handle,None,30,1)
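+    # rerun path: reset each temp row to status 1 and delete the product row that was generated from it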
+    mt = MultiThreadHandler(task_queue,deleteAndReprocess,None,30,1)
     mt.run()
 
 def test_check_brand():
@@ -970,37 +1118,373 @@ def test_check_brand():
         else:
             brand = _d.get("brand")
             list_illegal_brand.append(brand)
-    with open("legal_brand.txt","w",encoding="utf8") as f:
+    with open("../../test/legal_brand.txt", "w", encoding="utf8") as f:
         for b in list_legal_brand:
             f.write(b+"\n")
-    with open("illegal_brand.txt","w",encoding="utf8") as f:
+    with open("../../test/illegal_brand.txt", "w", encoding="utf8") as f:
         for b in list_illegal_brand:
             f.write(b+"\n")
 
 def test_match():
-    a = "Mini-7"
-    vector = request_embedding(a)
+    a = "桂林市啄木鸟医疗器械有限公司"
+
+    # vector = request_embedding(get_milvus_standard_name(a))
+    # vector = [get_embedding_request(b) for b in a]
+    pm = Product_Manager()
+    _GRADE = BRAND_GRADE
+    Coll,_ = pm.get_collection(_GRADE)
+    print(Coll.name)
+
+    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id","remove_words","level"]
+    # start_time = time.time()
+    _id = get_milvus_product_dict_id(a)
+    print(Coll.query(expr=" ots_id in ['%s'] "%(_id),output_fields=output_fields))
+    # print("cost",time.time()-start_time)
+    # print(Coll.compact())
+    # result = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=20)
+    #
+    # final_list = []
+    # for _search in result:
+    #     _d = {}
+    #     for k in output_fields:
+    #         _d[k] = _search.entity.get(k)
+    #     final_list.append(_d)
+    # final_list = remove_repeat_item(final_list,k="ots_name")
+
+    start_time = time.time()
+    # final_list = get_embedding_search(Coll,embedding_index_name,a,_GRADE,vector,pm.search_params,output_fields,limit=5)
+    final_list = get_intellect_search(Coll,embedding_index_name,a,_GRADE,pm.search_params,output_fields,limit=10)
+    for _search in final_list:
+        ots_id = _search.get("standard_name_id")
+        ots_name = _search.get("ots_name")
+        standard_name = _search.get("standard_name")
+        ots_parent_id = _search.get("ots_parent_id")
+        remove_words = _search.get("remove_words")
+        if check_brand(a,ots_name,remove_words):
+            print("similar",a,ots_name)
+        else:
+            print("not similar",a,ots_name)
+
+    print("cost",time.time()-start_time)
+    print(final_list)
+
+
+def rebuild_milvus():
+
+    pdm = Product_Dict_Manager()
+    from multiprocessing import Queue as PQueue
+    bool_query = BoolQuery(must_queries=[
+        RangeQuery(DOCUMENT_PRODUCT_DICT_GRADE,3)
+    ])
+    ots_client = getConnect_ots()
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("name")]),limit=100,get_total_count=True),
+                                                                   ColumnsToGet([DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS],return_type=ColumnReturnType.SPECIFIED))
+
+    list_data = getRow_ots(rows)
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       ColumnsToGet([DOCUMENT_PRODUCT_DICT_GRADE,DOCUMENT_PRODUCT_DICT_NAME,DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS],return_type=ColumnReturnType.SPECIFIED))
+
+        list_data.extend(getRow_ots(rows))
+        print("%d/%d"%(len(list_data),total_count))
+
+        # if len(list_data)>1000:
+        #     break
+
+    set_name_grade = set()
+    task_queue = PQueue()
+    for _data in list_data:
+        name = _data.get(DOCUMENT_PRODUCT_DICT_NAME)
+        grade = _data.get(DOCUMENT_PRODUCT_DICT_GRADE)
+        _key = "%s--%d"%(name,grade)
+        if _key not in set_name_grade:
+            task_queue.put(_data)
+        set_name_grade.add(_key)
+
+    log("rebuild milvus %d counts"%(task_queue.qsize()))
+    def insert_into_milvus(item,result_queue):
+        name = item.get(DOCUMENT_PRODUCT_DICT_NAME,"")
+        grade = item.get(DOCUMENT_PRODUCT_DICT_GRADE)
+
+        if grade==SPECS_GRADE:
+            name = clean_product_specs(name)
+            if len(name)<2:
+                return
+        if len(name)<2:
+            return
+
+
+        parent_id = item.get(DOCUMENT_PRODUCT_DICT_PARENT_ID,"")
+
+        Coll,_ = pdm.get_collection(grade)
+        standard_alias = item.get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,"")
+
+        log("insert name %s grade %d"%(name,grade))
+        remove_words = item.get(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,"")
+        level = item.get(DOCUMENT_PRODUCT_DICT_LEVEL)
+        if level is None:
+            if re.search("装置|设备",name) is not None:
+                level = 2
+            else:
+                level = 1
+        insert_new_record_to_milvus(Coll,name,grade,parent_id,standard_alias,remove_words,level)
+
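+    # fan out the rebuild: p_count processes, each running 5 consumer threads over the shared multiprocessing queue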
+    def start_thread():
+        mt = MultiThreadHandler(task_queue,insert_into_milvus,None,5)
+        mt.run()
+    p_count = 5
+    list_p = []
+    for i in range(p_count):
+        p = Process(target=start_thread)
+        list_p.append(p)
+    for p in list_p:
+        p.start()
+    for p in list_p:
+        p.join()
+
+def move_document_product():
+    bool_query = BoolQuery(must_queries=[
+        ExistsQuery(DOCUMENT_PRODUCT_NAME)
+    ])
+    ots_client = getConnect_ots()
+    Document_product_table_name = "document_product"
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("name")]),limit=100,get_total_count=True),
+                                                                   ColumnsToGet(return_type=ColumnReturnType.ALL))
+    list_data = getRow_ots(rows)
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.ALL))
+        list_data.extend(getRow_ots(rows))
+        print("%d/%d"%(len(list_data),total_count))
+        # if len(list_data)>=1000:
+        #     break
+
+    task_queue = Queue()
+
+    for _data in list_data:
+        task_queue.put(_data)
+
+    def _handle(item,result_queue):
+
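+        # update_row writes into the new table (the model now points at document_product2); the delete below removes the row from the legacy table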
+        D1 = Document_product(item)
+        D1.update_row(ots_client)
+
+        D1.table_name = Document_product_table_name
+        D1.delete_row(ots_client)
+
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+current_path = os.path.dirname(__file__)
+def delete_brands():
+    filename = os.path.join(current_path,"illegal_brand.txt")
+
+    ots_client = getConnect_ots()
+    list_brand = []
+    with open(filename,"r",encoding="utf8") as f:
+        while 1:
+            brand = f.readline()
+            if not brand:
+                break
+            brand = brand.strip()
+            list_brand.append(brand)
+
     pm = Product_Manager()
-    Coll,_ = pm.get_collection(NAME_GRADE)
-    output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
-    search_list = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=60)
-    print(search_list)
+    Coll,_ = pm.get_collection(BRAND_GRADE)
+
+    print(Coll.name)
+    Coll.compact()
+    _count = 0
+
+    task_queue = Queue()
+    for brand in list_brand:
+        _count += 1
+        task_queue.put(brand)
+        # if _count>=2:
+        #     break
+
+    def _handle(brand,result_queue):
+
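+        # delete every dict row carrying this brand name, then drop its milvus record via the derived md5 id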
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,BRAND_GRADE),
+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,brand)
+        ])
+
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        _id = get_milvus_product_dict_id(brand)
+
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           ColumnsToGet(return_type=ColumnReturnType.NONE))
+            list_data.extend(getRow_ots(rows))
+        for _d in list_data:
+            dpd = Document_product_dict(_d)
+            dpd.delete_row(ots_client)
+        # print(Coll.query(expr=" ots_id in ['%s']"%(_id),output_fields=["ots_id","ots_name"]))
+        delete_counts = Coll.delete(expr=" ots_id in ['%s']"%(_id)).delete_count
+
+        log("brand %s total_count %d md5:%s delete_counts:%d"%(brand,total_count,_id,delete_counts))
+
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+
+
+def delete_specs():
+    filename = os.path.join(current_path,"illegal_specs.txt")
+
+    ots_client = getConnect_ots()
+    list_specs = []
+    with open(filename,"r",encoding="utf8") as f:
+        while 1:
+            specs = f.readline()
+            if not specs:
+                break
+            specs = specs.strip()
+            list_specs.append(specs)
+
+    pm = Product_Manager()
+    Coll,_ = pm.get_collection(SPECS_GRADE)
+    print(Coll.name)
+    Coll.compact()
+
+    _count = 0
+    task_queue = Queue()
+
+    for specs in list_specs:
+        task_queue.put(specs)
+        _count += 1
+        # if _count>=2:
+        #     break
+
+    def _handle(specs,result_queue):
+
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE),
+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,specs)
+        ])
+
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        _id = get_milvus_product_dict_id(specs)
+
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           ColumnsToGet(return_type=ColumnReturnType.NONE))
+            list_data.extend(getRow_ots(rows))
+        for _d in list_data:
+            dpd = Document_product_dict(_d)
+            dpd.delete_row(ots_client)
+        # print(Coll.query(expr=" ots_id in ['%s']"%(_id),output_fields=["ots_id","ots_name"]))
+        delete_counts = Coll.delete(expr=" ots_id in ['%s']"%(_id)).delete_count
+
+        log("brand %s total_count %d md5:%s delete_counts:%d"%(specs,total_count,_id,delete_counts))
+
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+    Coll.compact()
+
+def remove_redis_keys():
+    db = redis.Redis(connection_pool=pool_product)
+    db.flushdb()
+
+
+def update_document_product_dict():
+    import pandas as pd
+    filename = "update_product.csv"
+    df = pd.read_csv(filename,encoding="gbk")
+    ots_client = getConnect_ots()
+    for name,grade,standard_alias,remove_words,level in zip(df["name"],df["grade"],df["standard_alias"],df["remove_words"],df["level"]):
+        name = name.strip()
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,name),
+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,grade)
+        ])
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
+                                                                       SearchQuery(bool_query,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.NONE))
+        if total_count==1:
+            list_data = getRow_ots(rows)
+            _data = list_data[0]
+            dpd = Document_product_dict(_data)
+            if level is None or str(level)=="nan":
+                level = 1
+            if re.search("器械|设备|其他",name) is not None and level==1:
+                level = 2
+            if str(remove_words)=="nan":
+                remove_words = ""
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS,standard_alias,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_REMOVE_WORDS,remove_words,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_LEVEL,level,True)
+            dpd.setValue(DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED,IS_SYNCHONIZED+1,True)
+            dpd.update_row(ots_client)
+            print(dpd.getProperties())
+
 
 
 def test():
     # pm = Product_Manager()
     # pm.test()
-    fix_product_data()
+    # fix_product_data()
     # test_check_brand()
-    # test_match()
+    test_match()
+    # rebuild_milvus()
+
+    # move_document_product()
+    # delete_brands()
+    # delete_specs()
+    # remove_redis_keys()
+    # update_document_product_dict()
+
+def clean_product_dict_interface():
+    ots_client = getConnect_ots()
+    bool_query = BoolQuery(must_queries=[
+        BoolQuery(should_queries=[
+            TermQuery("action","insert"),
+            TermQuery("action","base")
+        ])
+    ])
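+    # page through every interface row whose action is "insert" or "base" and delete them concurrently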
+    task_queue = Queue()
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),get_total_count=True,limit=100),
+                                                                   columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+    list_data = getRow_ots(rows)
+    for _data in list_data:
+        task_queue.put(_data)
+    print("%d/%d"%(task_queue.qsize(),total_count))
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_data = getRow_ots(rows)
+        for _data in list_data:
+            task_queue.put(_data)
+        print("%d/%d"%(task_queue.qsize(),total_count))
+
+    def _handle(item,result_queue):
+        _dpd = Document_product_dict_interface(item)
+        _dpd.delete_row(ots_client)
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
 
 if __name__ == '__main__':
 
+    # test()
     # start_process_product()
     # print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
-    test()
-    print(Product_Manager.get_bid_filemd5s(155415770,getConnect_ots()))
-    name = "一"
-    ots_name = "一氧化碳分析仪"
-    print(is_similar(name,ots_name),check_product(name,ots_name))
-    print(is_legal_specs('SCM-A/SB(0.18D)'))
+    # print(Product_Manager.get_bid_filemd5s(155415770,getConnect_ots()))
+    # name = "一"
+    # ots_name = "一氧化碳分析仪"
+    # print(is_similar(name,ots_name),check_product(name,ots_name))
+    # print(is_legal_specs('SCM-A/SB(0.18D)'))
+    clean_product_dict_interface()

+ 29 - 0
BaseDataMaintenance/maxcompute/attachmentRec.py

@@ -84,6 +84,33 @@ class f_getPlatform(object):
         return getPlatform()
 
 
+import hashlib
+def getMD5(_text):
+    if _text is not None and len(_text)>0:
+        if isinstance(_text,str):
+            bs = _text.encode()
+        elif isinstance(_text,bytes):
+            bs = _text
+        else:
+            return ""
+        md5 = hashlib.md5()
+        md5.update(bs)
+        return md5.hexdigest()
+    return ""
+MAX_NAME_LENGTH = 300
+def get_milvus_standard_name(name):
+    return "%s"%(str(name)[:MAX_NAME_LENGTH].lower())
+
+def get_milvus_product_dict_id(name):
+    return getMD5(get_milvus_standard_name(name))
+
+@annotate('->string')
+class f_getMD5(object):
+
+    def evaluate(self,name):
+        return get_milvus_product_dict_id(name)
+
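+# e.g. f_getMD5 maps "Mini-7" and "MINI-7" to the same id, since names are
+# lowercased and truncated to MAX_NAME_LENGTH before hashing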
+
 @annotate('string->string,string,bigint')
 class f_strip_filemd5(BaseUDTF):
 
@@ -97,6 +124,8 @@ class f_strip_filemd5(BaseUDTF):
 
         self.forward(filemd5,filemd5_strip,parts)
 
+
+
 @annotate('string,bigint->string')
 class f_group_filemd5(BaseUDAF):
 

+ 2 - 0
BaseDataMaintenance/model/ots/document.py

@@ -70,6 +70,8 @@ document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
 
 
 document_total_tenderee_money = "total_tenderee_money"
+
+document_update_document = "update_document"
 class Document(BaseModel):
 
     def __init__(self,_dict):

+ 2 - 4
BaseDataMaintenance/model/ots/document_product.py

@@ -49,7 +49,7 @@ DOCUMENT_PRODUCT_ORIGINAL_SPECS = "original_specs"
 
 DOCUMENT_PRODUCT_BID_FILEMD5S = "bid_filemd5s"
 
-
+Document_product_table_name = "document_product2"
 
 class Document_product(BaseModel):
 
@@ -66,9 +66,7 @@ class Document_product(BaseModel):
                     v = _v
             self.setValue(k,v,True)
 
-
-
-        self.table_name = 'document_product'
+        self.table_name = Document_product_table_name
 
     def getPrimary_keys(self):
         return ['id']

+ 70 - 2
BaseDataMaintenance/model/ots/document_product_dict.py

@@ -13,17 +13,23 @@ DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED = "is_synchonized"
 
 DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS = "standard_alias"
 
+DOCUMENT_PRODUCT_DICT_REMOVE_WORDS = "remove_words"
+DOCUMENT_PRODUCT_DICT_LEVEL = "level"
+
 DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR = "|"
 
+
 MAX_NAME_LENGTH = 300
 
+Document_product_dict_table_name = "document_product_dict2"
+
 class Document_product_dict(BaseModel):
 
     def __init__(self,_dict):
         BaseModel.__init__(self)
         for k,v in _dict.items():
             self.setValue(k,v,True)
-        self.table_name = "document_product_dict"
+        self.table_name = Document_product_dict_table_name
 
     def getPrimary_keys(self):
         return ["id"]
@@ -46,4 +52,66 @@ def get_document_product_dict_id(parent_md5,name):
     return getMD5(parent_md5+"&&%s"%name)
 
 def get_document_product_dict_standard_alias_id(name):
-    return getMD5("alias&&%s"%name)
+    return get_milvus_product_dict_id(name)
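+# note: alias ids now share the md5 space of plain names, so an alias spelled like an existing name resolves to the same milvus record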
+
+def get_milvus_standard_name(name):
+    return "%s"%(str(name)[:MAX_NAME_LENGTH].lower())
+
+def get_milvus_product_dict_id(name):
+    return getMD5(get_milvus_standard_name(name))
+
+
+
+from BaseDataMaintenance.model.ots.document_product import *
+from BaseDataMaintenance.dataSource.source import getConnect_ots
+from tablestore import *
+from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+from queue import Queue
+def move_document_product_dict():
+
+    bool_query = BoolQuery(must_queries=[
+        ExistsQuery(DOCUMENT_PRODUCT_NAME)
+    ])
+    ots_client = getConnect_ots()
+    Document_product_table_name = "document_product_dict"
+    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("grade")]),limit=100,get_total_count=True),
+                                                                   ColumnsToGet(return_type=ColumnReturnType.ALL))
+    list_data = getRow_ots(rows)
+    while next_token:
+        rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_table_name,Document_product_table_name+"_index",
+                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                       ColumnsToGet(return_type=ColumnReturnType.ALL))
+        list_data.extend(getRow_ots(rows))
+        print("%d/%d"%(len(list_data),total_count))
+        # if len(list_data)>=2000:
+        #     break
+
+    task_queue = Queue()
+
+    dict_id_dict = {}
+
+    for _data in list_data:
+        task_queue.put(_data)
+        id = _data.get(DOCUMENT_PRODUCT_DICT_ID)
+        dict_id_dict[id] = _data
+
+    def _handle(item,result_queue):
+
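+        # only rows with status==1 are copied into the new dict table; every row is removed from the old table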
+        status = item.get(DOCUMENT_PRODUCT_DICT_STATUS)
+
+        D1 = Document_product_dict(item)
+        if status==1:
+
+            D1.update_row(ots_client)
+
+        D1.table_name = Document_product_table_name
+        D1.delete_row(ots_client)
+
+    mt = MultiThreadHandler(task_queue,_handle,None,30)
+    mt.run()
+
+if __name__ == '__main__':
+    # print(get_milvus_product_dict_id("-sl-10xls"))
+    move_document_product_dict()

+ 15 - 8
BaseDataMaintenance/model/ots/document_product_dict_interface.py

@@ -15,25 +15,32 @@ DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS = "standard_alias"
 DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR = "|"
 DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION = "action" #insert delete update
 
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_BASE = "base"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_DELETE = "delete"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_UPDATE = "update"
+DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION_INSERT = "insert"
+
+DOCUMENT_PRODUCT_DICT_INTERFACE_REMOVE_WORDS = "remove_words"
+DOCUMENT_PRODUCT_DICT_INTERFACE_LEVEL = "level"
+
 MAX_NAME_LENGTH = 300
 
+Document_product_dict_interface_table_name = "document_product_dict_interface"
+
+
+
 class Document_product_dict_interface(BaseModel):
 
     def __init__(self,_dict):
         BaseModel.__init__(self)
         for k,v in _dict.items():
             self.setValue(k,v,True)
-        self.table_name = "document_product_dict_interface"
+        self.table_name = Document_product_dict_interface_table_name
 
     def getPrimary_keys(self):
         return ["id"]
 
 
-
-
 from BaseDataMaintenance.common.documentFingerprint import getMD5
-def get_document_product_dict_id(parent_md5,name):
-    return getMD5(parent_md5+"&&%s"%name)
-
-def get_document_product_dict_standard_alias_id(name):
-    return getMD5("alias&&%s"%name)
+def get_document_product_dict_interface_base_id(name):
+    return "mdd5="+getMD5(name)

+ 3 - 1
BaseDataMaintenance/model/ots/document_product_tmp.py

@@ -40,13 +40,15 @@ DOCUMENT_PRODUCT_TMP_UPDATE_TIME = 'update_time'
 DOCUMENT_PRODUCT_TMP_NEW_ID = "new_id"
 
 
+Document_product_tmp_table_name = "document_product_temp"
+
 class Document_product_tmp(BaseModel):
 
     def __init__(self,dict):
         BaseModel.__init__(self)
         for k,v in dict.items():
             self.setValue(k,v,True)
-        self.table_name = 'document_product_temp'
+        self.table_name = Document_product_tmp_table_name
 
     def getPrimary_keys(self):
         return ['id']

Some files were not shown because too many files were changed