Browse Source

优化产品处理流匹配规则

luojiehua 1 year ago
parent
commit
d15905b651

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+/BaseDataMaintenance/maintenance/attachment/fixdownload/
+/logs/
+/BaseDataMaintenance/common/download/
+/.idea/

+ 59 - 11
BaseDataMaintenance/maintenance/product/productUtils.py

@@ -52,19 +52,36 @@ def is_similar(source,target):
     return False
 
 
-SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
 
-def check_specs(source,target):
-    '''
-    check if the source specs is the same as the target
-    same only if the chars in SPECS_CHECK_SET have the same counts
-    :param source:
-    :param target:
-    :return:
-    '''
+def is_contain(source,target,min_len=2):
+    '''
+    tell whether one of the two strings contains the other one
+    the contained string must be at least min_len characters long
+    :param source: first string
+    :param target: second string
+    :param min_len: minimum length required of the contained string
+    :return: True if the shorter string is a substring of the longer one, else False
+    '''
+    # on equal lengths target takes the inner role; the substring test is then plain equality
+    if len(target)<=len(source):
+        outer,inner = source,target
+    else:
+        outer,inner = target,source
+    return len(inner)>=min_len and inner in outer
+
+def check_product(source,target):
+    '''
+    product names match when one contains the other
+    and the contained name has at least 3 characters
+    :param source: product name to check
+    :param target: product name to compare with
+    :return: True if matched else False
+    '''
+    return is_contain(source,target,min_len=3)
+
+
+def check_brand(source,target):
+    '''
+    brands match when, ignoring case, one contains the other
+    (is_contain applies its default minimum length to the contained brand)
+    :param source: brand to check, converted with str()
+    :param target: brand to compare with, converted with str()
+    :return: True if matched else False
+    '''
+    source = str(source).lower()
+    target = str(target).lower()
+
+    # always hand back a real bool instead of falling off the end with None
+    return bool(is_contain(source,target))
+
+# characters that are significant when two specs strings are compared.
+# the callers lower-case their input first and str.lower() maps the roman numerals
+# 'Ⅰ'..'Ⅻ' to 'ⅰ'..'ⅻ', so the lower-case forms have to be part of the set as well.
+SPECS_CHECK_SET = set('abcdefghijklmnopqrstuvwxyz') | set('0123456789.') | set('IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ') | set('IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'.lower())
+# matches every character that is NOT in SPECS_CHECK_SET; the members are escaped and
+# sorted so the character class is well-formed and identical on every run.
+# NOTE(review): the name SPECS_PATTERN is assigned again further down in this module
+# (the pattern used by clean_product_specs), which rebinds this one at import time.
+SPECS_PATTERN = re.compile("[^%s]"%(re.escape("".join(sorted(SPECS_CHECK_SET)))))
+
+def has_same_specs_count(source, target):
+
     source = str(source).lower()
     target = str(target).lower()
 
+    # only the kinds of characters and how often each occurs are compared; their order is ignored
     dict_source = {}
     dict_target = {}
     for s in source:
@@ -83,8 +100,39 @@ def check_specs(source,target):
     for k,v in dict_source.items():
         if v!=dict_target.get(k):
             return False
+
     return True
 
+def check_specs(source,target):
+    '''
+    check if the source specs is the same as the target
+    both values are lower-cased and reduced to the chars in SPECS_CHECK_SET, then they are
+    the same if the reduced strings are equal and not empty, or if has_same_specs_count
+    accepts them and their common suffix is longer than half of the shorter string
+    :param source: specs to check, converted with str()
+    :param target: specs to compare with, converted with str()
+    :return: True if they are treated as the same specs else False
+    '''
+    source = str(source).lower()
+    target = str(target).lower()
+
+    # filter with SPECS_CHECK_SET itself: the module-level name SPECS_PATTERN is assigned
+    # a second time further down (for clean_product_specs), so at call time it no longer
+    # refers to the pattern that was built from SPECS_CHECK_SET
+    source = "".join([_c for _c in source if _c in SPECS_CHECK_SET])
+    target = "".join([_c for _c in target if _c in SPECS_CHECK_SET])
+
+    # nothing significant left on one side, nothing to compare
+    if len(source)==0 or len(target)==0:
+        return False
+
+    if source==target:
+        return True
+
+    if has_same_specs_count(source,target):
+        # count the chars that really match from the end; the position of the first
+        # mismatch must not be counted as a match
+        _matched = 0
+        for _s,_t in zip(reversed(source),reversed(target)):
+            if _s!=_t:
+                break
+            _matched += 1
+        if _matched>min(len(source),len(target))//2:
+            return True
+
+    return False
+
+
+
+
+
+
 import json
 
 import requests
@@ -115,7 +163,7 @@ def clean_product_brand(product_brand):
     '''
     return product_brand
 
-SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()()]")
+SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
 def clean_product_specs(product_specs):
     '''
     clean before insert
@@ -160,4 +208,4 @@ def clean_product_quantity(product_quantity):
 
 if __name__ == '__main__':
     print(clean_product_specs("XY-K-JLJ-3A"))
-    print(check_specs("佳士比F6",'佳士比”F6'))
+    print(check_specs("3.6",'3.6'))

+ 14 - 3
BaseDataMaintenance/maintenance/product/products.py

@@ -130,6 +130,16 @@ class Product_Manager(Product_Dict_Manager):
         brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
         specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
 
+        max_len = max(len(name),len(brand),len(specs))
+        max_len_str = name if len(name)==max_len else brand if len(brand)==max_len else specs
+
+        if name=="" and max_len>=8:
+            name = max_len_str
+        if brand=="" and max_len>=8:
+            brand = max_len_str
+        if specs=="" and max_len>=8:
+            specs = max_len_str
+
         new_name = ""
         new_brand = ""
         new_specs = ""
@@ -148,7 +158,7 @@ class Product_Manager(Product_Dict_Manager):
                     ots_name = _search.entity.get("standard_name")
                     ots_parent_id = _search.entity.get("ots_parent_id")
 
-                    if is_similar(name,ots_name):
+                    if is_similar(name,ots_name) or check_product(name,ots_name):
                         name_ots_id = ots_id
                         new_name = ots_name
 
@@ -171,7 +181,7 @@ class Product_Manager(Product_Dict_Manager):
                         ots_name = _search.entity.get("standard_name")
                         ots_parent_id = _search.entity.get("ots_parent_id")
 
-                        if is_similar(brand,ots_name):
+                        if is_similar(brand,ots_name) or check_brand(brand,ots_name):
                             new_brand = ots_name
 
                             # check whether a brand whose parent_id is name_ots_id already exists; insert one if not, otherwise update its alias
@@ -446,4 +456,5 @@ def start_process_product():
 
 if __name__ == '__main__':
 
-    start_process_product()
+    # start_process_product()
+    print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))

+ 1 - 1
BaseDataMaintenance/model/ots/document_product_dict.py

@@ -46,4 +46,4 @@ def get_document_product_dict_id(parent_md5,name):
     return getMD5(parent_md5+"&&%s"%name)
 
 def get_document_product_dict_standard_alias_id(name):
-    return getMD5("alias%s"%name)
+    return getMD5("alias&&%s"%name)