Quellcode durchsuchen

参数匹配逻辑自动补全单价,数量,总价中缺失部分;通过参数辅助匹配产品,品牌和型号;增加非法数据过滤

luojiehua vor 1 Jahr
Ursprung
Commit
ffc8b74e98

+ 2 - 0
.gitignore

@@ -2,3 +2,5 @@
 /logs/
 /BaseDataMaintenance/common/download/
 /.idea/
+/attachmentProcessTime2.xlsx
+/BaseDataMaintenance/maintenance/attachment/2022-01-18_183521_export11.xlsx

+ 2 - 0
BaseDataMaintenance/maintenance/product/1.py

@@ -30,6 +30,8 @@ from uuid import uuid4
 
 print(type(uuid4().hex))
 
+print(5==5.00)
+
 
 
 

+ 4 - 1
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -52,16 +52,19 @@ def jaccard_score(source,target):
 
 
 from fuzzywuzzy import fuzz
-def is_similar(source,target):
+def is_similar(source,target,_radio=None):
     source = str(source).lower()
     target = str(target).lower()
     max_len = max(len(source),len(target))
     min_len = min(len(source),len(target))
+
     min_ratio = 90
     if min_len>=3:
         min_ratio = 87
     if min_len>=5:
         min_ratio = 85
+    if _radio is not None:
+        min_ratio = _radio
     # dis_len = abs(len(source)-len(target))
     # min_dis = min(max_len*0.2,4)
     if min_len==0 and max_len>0:

+ 58 - 58
BaseDataMaintenance/maintenance/product/product_dict.py

@@ -363,17 +363,18 @@ class Product_Dict_Manager():
         #update document_product_dict
         if original_id is None or original_id=="":
             original_id = get_document_product_dict_id(parent_id,name)
-        _d = {DOCUMENT_PRODUCT_DICT_ID:original_id,
-              DOCUMENT_PRODUCT_DICT_ALIAS:alias,
-              DOCUMENT_PRODUCT_DICT_NAME:name,
-              DOCUMENT_PRODUCT_DICT_GRADE:grade,
-              DOCUMENT_PRODUCT_DICT_PARENT_ID:parent_id,
-              DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS:standard_alias,
-              DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
-              DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
-              DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
-        _dpd = Document_product_dict(_d)
-        _dpd.update_row(self.ots_client)
+        if parent_id is not None and parent_id!="":
+            _d = {DOCUMENT_PRODUCT_DICT_ID:original_id,
+                  DOCUMENT_PRODUCT_DICT_ALIAS:alias,
+                  DOCUMENT_PRODUCT_DICT_NAME:name,
+                  DOCUMENT_PRODUCT_DICT_GRADE:grade,
+                  DOCUMENT_PRODUCT_DICT_PARENT_ID:parent_id,
+                  DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS:standard_alias,
+                  DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                  DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                  DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
+            _dpd = Document_product_dict(_d)
+            _dpd.update_row(self.ots_client)
 
         # search interface if name and grade exists then update document_product_dict and return
         bool_query = BoolQuery(must_queries=[
@@ -451,49 +452,22 @@ class Product_Dict_Manager():
             for s in standard_alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR):
                 if s!="":
                     new_name_set.add(s)
-        if len(new_name_set)==len(old_name_set) and len(new_name_set)==len(new_name_set&old_name_set):
-            return
-
-        # update the milvus
-        Coll,_ = self.get_collection(grade)
-        o_id = original_id
-        expr = " ots_id in ['%s']"%o_id
-        Coll.delete(expr)
-
-        _alias = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS)
-        if _alias is not None and _alias!="":
-            list_alias = _alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR)
-            for _alias in list_alias:
-                _alias = _alias.strip()
-                if len(_alias)==0:
-                    continue
-                if _alias==name:
-                    continue
-                _id = get_document_product_dict_standard_alias_id(_alias)
-                expr = " ots_id in ['%s']"%o_id
-                Coll.delete(expr)
 
         if old_name!=name:
             new_id = get_document_product_dict_id(parent_id, name)
         else:
             new_id = original_id
 
-        list_name = []
-        vector = request_embedding(name)
-        if vector is not None and Coll is not None:
-            id = new_id
-            data = [[id],
-                    [name],
-                    [name],
-                    [id],
-                    [vector],
-                    [parent_id],
-                    [grade]]
-            insert_embedding(Coll,data)
-            list_name.append(name)
+        # update the milvus
+        if not (len(new_name_set)==len(old_name_set) and len(new_name_set)==len(new_name_set&old_name_set)):
+            Coll,_ = self.get_collection(grade)
+            o_id = original_id
+            expr = " ots_id in ['%s']"%o_id
+            Coll.delete(expr)
 
-            if standard_alias is not None and standard_alias!="":
-                list_alias = standard_alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR)
+            _alias = dpd.getProperties().get(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS)
+            if _alias is not None and _alias!="":
+                list_alias = _alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR)
                 for _alias in list_alias:
                     _alias = _alias.strip()
                     if len(_alias)==0:
@@ -501,16 +475,42 @@ class Product_Dict_Manager():
                     if _alias==name:
                         continue
                     _id = get_document_product_dict_standard_alias_id(_alias)
-                    vector = request_embedding(_alias)
-                    data = [[_id],
-                            [_alias],
-                            [name],
-                            [id],
-                            [vector],
-                            [parent_id],
-                            [grade]]
-                    insert_embedding(Coll,data)
-                    list_name.append(_alias)
+                    expr = " ots_id in ['%s']"%o_id
+                    Coll.delete(expr)
+
+            list_name = []
+            vector = request_embedding(name)
+            if vector is not None and Coll is not None:
+                id = new_id
+                data = [[id],
+                        [name],
+                        [name],
+                        [id],
+                        [vector],
+                        [parent_id],
+                        [grade]]
+                insert_embedding(Coll,data)
+                list_name.append(name)
+
+                if standard_alias is not None and standard_alias!="":
+                    list_alias = standard_alias.split(DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS_SEPARATOR)
+                    for _alias in list_alias:
+                        _alias = _alias.strip()
+                        if len(_alias)==0:
+                            continue
+                        if _alias==name:
+                            continue
+                        _id = get_document_product_dict_standard_alias_id(_alias)
+                        vector = request_embedding(_alias)
+                        data = [[_id],
+                                [_alias],
+                                [name],
+                                [id],
+                                [vector],
+                                [parent_id],
+                                [grade]]
+                        insert_embedding(Coll,data)
+                        list_name.append(_alias)
 
         # process history
         delete_names = list(old_name_set-new_name_set)
@@ -522,7 +522,7 @@ class Product_Dict_Manager():
 
 
         # update document_product_dict
-        _d = {DOCUMENT_PRODUCT_DICT_ID:original_id,
+        _d = {DOCUMENT_PRODUCT_DICT_ID:new_id,
               DOCUMENT_PRODUCT_DICT_NAME:name,
               DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
               DOCUMENT_PRODUCT_DICT_STANDARD_ALIAS:standard_alias}

+ 201 - 10
BaseDataMaintenance/maintenance/product/products.py

@@ -113,6 +113,54 @@ class Product_Manager(Product_Dict_Manager):
         self.standardize(item)
 
 
+    def match_specs(self,specs):
+        bool_query = BoolQuery(must_queries=[
+            TermQuery(DOCUMENT_PRODUCT_DICT_NAME,specs),
+            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE)
+        ])
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_dict","document_product_dict_index",
+                                                                            SearchQuery(bool_query,get_total_count=True))
+        if total_count>0:
+            new_specs = specs
+            return new_specs
+        else:
+            debug("getting sepcs %s"%(specs))
+            list_specs = []
+            c_specs = clean_product_specs(specs)
+            list_specs.append(c_specs)
+
+            for s in re.split("[\u4e00-\u9fff]",specs):
+                if s!="" and len(s)>4:
+                    list_specs.append(s)
+            similar_flag = None
+            _index = 0
+            break_flag = False
+            for c_specs in list_specs:
+                if break_flag:
+                    break
+                _index += 1
+                specs_vector = request_embedding(c_specs)
+
+                if specs_vector is not None:
+                    Coll,_ = self.get_collection(SPECS_GRADE)
+                    search_list = search_embedding(Coll,embedding_index_name,[specs_vector],self.search_params,output_fields,limit=60)
+
+                    for _search in search_list:
+
+                        ots_id = _search.entity.get("standard_name_id")
+                        ots_name = _search.entity.get("standard_name")
+                        ots_parent_id = _search.entity.get("ots_parent_id")
+
+                        debug("checking specs %s and %s"%(specs,ots_name))
+                        if is_similar(specs,ots_name):
+                            # log("specs is_similar")
+                            if check_specs(c_specs,ots_name):
+                                break_flag = True
+                                new_specs = ots_name
+                                return new_specs
+
+
+
     def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
         '''
         Standardizes the product data
@@ -138,16 +186,15 @@ class Product_Manager(Product_Dict_Manager):
         name = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,"")
         brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
+        parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
+
+        list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
 
-        max_len = max(len(name),len(brand),len(specs))
-        max_len_str = name if len(name)==max_len else brand if len(brand)==max_len else specs
+        if brand=="" and parameters!="":
+            brand = parameters
+        if specs=="" and parameters!="":
+            specs = parameters
 
-        if name=="" and max_len>=8:
-            name = max_len_str
-        if brand=="" and max_len>=8:
-            brand = max_len_str
-        if specs=="" and max_len>=8:
-            specs = max_len_str
 
         new_name = ""
         new_brand = ""
@@ -177,6 +224,28 @@ class Product_Manager(Product_Dict_Manager):
                         # if _flag and _dpd.updateAlias(name):
                         #     _dpd.update_row(self.ots_client)
                         break
+        if name_ots_id is None:
+            for name in list_candidates:
+                name_vector = request_embedding(name)
+                if name_vector is not None:
+                    Coll,_ = self.get_collection(NAME_GRADE)
+                    search_list = search_embedding(Coll,embedding_index_name,[name_vector],self.search_params,output_fields,limit=60)
+
+                    for _search in search_list:
+                        ots_id = _search.entity.get("standard_name_id")
+                        ots_name = _search.entity.get("standard_name")
+                        ots_parent_id = _search.entity.get("ots_parent_id")
+
+                        if is_similar(name,ots_name) or check_product(name,ots_name):
+                            name_ots_id = ots_id
+                            new_name = ots_name
+
+                            # #update alias of name
+                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
+                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                            # if _flag and _dpd.updateAlias(name):
+                            #     _dpd.update_row(self.ots_client)
+                            break
         if name_ots_id is not None:
 
             if brand is not None and brand!="":
@@ -277,7 +346,67 @@ class Product_Manager(Product_Dict_Manager):
                                 dpdi.update_row(self.ots_client)
                                 break
 
+            if brand_ots_id is None:
+                _find = False
+                for brand in list_candidates:
+                    if _find:
+                        break
+                    l_brand = [brand]
+                    l_brand.append(clean_product_brand(s_brand))
+                    brand_ch = get_chinese_string(brand)
+                    l_brand.extend(brand_ch)
+
+                    for brand in l_brand:
+                        if _find:
+                            break
+                        brand_vector = request_embedding(brand)
+                        if brand_vector is not None:
+                            Coll,_ = self.get_collection(BRAND_GRADE)
+                            search_list = search_embedding(Coll,embedding_index_name,[brand_vector],self.search_params,output_fields,limit=60)
+
+                            # log("search brand %s"%(brand))
+                            for _search in search_list:
 
+                                ots_id = _search.entity.get("standard_name_id")
+                                ots_name = _search.entity.get("standard_name")
+                                ots_parent_id = _search.entity.get("ots_parent_id")
+
+                                # log("check brand %s and %s"%(brand,ots_name))
+                                if is_similar(brand,ots_name,_radio=95) or check_brand(brand,ots_name):
+                                    # log("check brand similar succeed:%s and %s"%(brand,ots_name))
+                                    new_brand = ots_name
+                                    log("checking brand %s succeed %s"%(brand,new_brand))
+                                    # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
+
+                                    if name_ots_id is not None:
+                                        brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
+
+                                        _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
+                                                    DOCUMENT_PRODUCT_DICT_NAME:new_brand,
+                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
+                                                    DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
+                                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
+                                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                                    }
+                                        _dpd_brand = Document_product_dict(_d_brand)
+                                        # _dpd_brand.updateAlias(str(new_brand).lower())
+                                        if not _dpd_brand.exists_row(self.ots_client):
+                                            _dpd_brand.update_row(self.ots_client)
+
+                                        else:
+                                            pass
+                                            # #update alias
+                                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
+                                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
+                                            # if _flag:
+                                            #     if _dpd.updateAlias(brand):
+                                            #         _dpd.update_row(self.ots_client)
+
+                                    _find = True
+                                    break
 
             if specs is not None and specs!="":
 
@@ -432,10 +561,39 @@ class Product_Manager(Product_Dict_Manager):
                                       }
                                 _dpdi = Document_product_dict_interface(_d)
                                 _dpdi.update_row(self.ots_client)
+        if specs_ots_id is None:
+            _find = False
+            for specs in list_candidates:
+                if _find:
+                    break
+                s = self.match_specs(specs)
+                if s is not None:
+                    new_specs = s
+                    if brand_ots_id is not None:
+                        # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
+                        specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
+
+                        _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
+                                    DOCUMENT_PRODUCT_DICT_NAME:new_specs,
+                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
+                                    DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
+                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
+                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
+                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
+                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
+                                    }
+                        _dpd_specs = Document_product_dict(_d_specs)
+                        # _dpd_specs.updateAlias(str(new_specs).lower())
+                        if not _dpd_specs.exists_row(self.ots_client):
+                            _dpd_specs.update_row(self.ots_client)
+                        _find = True
+                        break
 
         # judge if the product matches the standard product
         if name_ots_id is not None:
 
+            is_legal_data = True
             #standard the product and same to document_product table
             _product = Document_product(tmp_dict)
             docid = _product.getProperties().get(DOCUMENT_PRODUCT_DOCID)
@@ -445,11 +603,41 @@ class Product_Manager(Product_Dict_Manager):
             unit_price = clean_product_unit_price(unit_price)
             quantity = clean_product_quantity(quantity)
 
+            total_price = _product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE)
+
             _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
             _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
-            if isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
+
+            win_bid_price = _product.getProperties().get(DOCUMENT_PRODUCT_WIN_BID_PRICE)
+
+
+            if isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)) and isinstance(total_price,(float,int)):
+                new_quantity = total_price/unit_price
+                if new_quantity!=quantity:
+                    if new_quantity==total_price//unit_price:
+                        quantity = int(new_quantity)
+                        _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
+                    else:
+                        is_legal_data = False
+            elif isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
                 total_price = float("%.2f"%(unit_price*quantity))
                 _product.setValue(DOCUMENT_PRODUCT_TOTAL_PRICE,total_price,True)
+            elif isinstance(unit_price,(float,int)) and isinstance(total_price,(float,int)):
+                quantity = int(total_price//unit_price)
+                _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
+            elif isinstance(quantity,(float,int)) and isinstance(total_price,(float,int)):
+                unit_price = float("%.2f"%(total_price/quantity))
+                _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
+            elif isinstance(quantity,(float,int)) and quantity>10000:
+                is_legal_data = False
+
+            if isinstance(_product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE),(float,int)) and isinstance(win_bid_price,(float,int)):
+                if _product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE)>win_bid_price:
+                    is_legal_data = False
+
+            if isinstance(_product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE),(float,int)) and _product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE)>100000000:
+                is_legal_data = False
+
 
             new_id = self.get_product_id(docid,new_name,new_brand,new_specs,unit_price,quantity)
 
@@ -481,7 +669,10 @@ class Product_Manager(Product_Dict_Manager):
             if bid_filemd5s is not None:
                 _product.setValue(DOCUMENT_PRODUCT_BID_FILEMD5S,bid_filemd5s,True)
 
-            if self.dumplicate(_product):
+            if not is_legal_data:
+                _status = randint(501,550)
+
+            elif self.dumplicate(_product):
                 _status = randint(201,300)
                 save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
 

+ 1 - 0
BaseDataMaintenance/model/ots/document_product.py

@@ -25,6 +25,7 @@ DOCUMENT_PRODUCT_TENDEREE_CONTACT = 'tenderee_contact'
 DOCUMENT_PRODUCT_PROCUREMENT_SYSTEM = 'procurement_system'
 DOCUMENT_PRODUCT_BIDDING_BUDGET = 'bidding_budget'
 DOCUMENT_PRODUCT_WIN_TENDERER = 'win_tenderer'
+DOCUMENT_PRODUCT_WIN_BID_PRICE = "win_bid_price"
 DOCUMENT_PRODUCT_PROVINCE = 'province'
 DOCUMENT_PRODUCT_CITY = 'city'
 DOCUMENT_PRODUCT_DISTRICT = 'district'