123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- from BaseDataMaintenance.maintenance.product.product_setting import *
- import re
# Punctuation (ASCII plus common CJK / full-width marks) that is ignored when
# deciding whether a string is made up purely of Chinese characters.
_IGNORED_PUNCTUATION = frozenset(
    '·’!"#$%&\'()#!()*+,-./:;<=>?@,:?¥★、….>【】[]《》?“”‘’[\\]^_`{|}~'
)


# Decide whether the input string is entirely Chinese.
def judge_pur_chinese(keyword):
    """
    Return True when ``keyword`` contains only Chinese characters.

    Characters listed in ``_IGNORED_PUNCTUATION`` are skipped; every remaining
    character must fall in the range U+4E00..U+9FFF. A string with nothing
    left after skipping punctuation (including the empty string) yields True.

    The previous implementation tried to strip punctuation with a regular
    expression whose character class was closed early by an unescaped ``]``,
    so punctuation was effectively never removed. Membership in an explicit
    set avoids regex escaping altogether.

    @param keyword: the string to inspect
    @return: True if all non-punctuation characters are Chinese, else False
    """
    return all(
        u'\u4e00' <= ch <= u'\u9fff'
        for ch in keyword
        if ch not in _IGNORED_PUNCTUATION
    )
- from fuzzywuzzy import fuzz
def is_similar(source,target):
    """
    Decide whether two values are similar enough to be treated as the same.

    Both arguments are converted with ``str`` and lower-cased before comparing.

    * If exactly one of them is empty the result is False.
    * If the longer string has at most 4 characters, only an exact match counts.
    * Otherwise the pair is similar when ``fuzz.ratio`` is above 90, or when
      both strings pass ``judge_pur_chinese`` and the shorter one is contained
      in the longer one.

    NOTE(review): the nesting was reconstructed from a de-indented source;
    confirm that the fuzzy-ratio and containment checks belong only to the
    "longer than 4 characters" branch.

    :param source: first value to compare
    :param target: second value to compare
    :return: True if the values are considered similar, otherwise False
    """
    left = str(source).lower()
    right = str(target).lower()
    longest = max(len(left), len(right))
    shortest = min(len(left), len(right))

    # Exactly one side is empty: never similar.
    if shortest == 0 and longest > 0:
        return False

    # Short strings must match exactly.
    if longest <= 4:
        return left == right

    # Fuzzy similarity check.
    if fuzz.ratio(left, right) > 90:
        return True

    # Containment is only considered when both sides are purely Chinese.
    if not (judge_pur_chinese(left) and judge_pur_chinese(right)):
        return False

    if len(left) == longest:
        return right in left
    return left in right
_SPECS_ROMAN_NUMERALS = 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'

# Characters that are significant when comparing specs. check_specs lower-cases
# its inputs before counting, and str.lower() maps the Roman numeral code
# points to their small forms, so the lower-cased numerals must be present
# here for Roman numerals to be counted at all. The upper-case forms are kept
# for any other user of this set.
SPECS_CHECK_SET = (
    set('abcdefghijklmnopqrstuvwxyz')
    | set('0123456789')
    | set(_SPECS_ROMAN_NUMERALS)
    | set(_SPECS_ROMAN_NUMERALS.lower())
)


def _count_spec_chars(text):
    # Count occurrences of each character of ``text`` that is in SPECS_CHECK_SET.
    counts = {}
    for ch in text:
        if ch in SPECS_CHECK_SET:
            counts[ch] = counts.get(ch, 0) + 1
    return counts


def check_specs(source,target):
    '''
    Check whether the source specs match the target specs.

    Both values are converted with ``str`` and lower-cased; only characters in
    SPECS_CHECK_SET (ASCII letters, digits, Roman numerals) are considered.
    The result is True when every significant character of ``source`` occurs
    in ``target`` with exactly the same count.

    NOTE(review): the comparison is one-directional -- significant characters
    that appear only in ``target`` do not cause a mismatch. This mirrors the
    previous behaviour; confirm whether a symmetric comparison is wanted.

    :param source: specs value to check
    :param target: specs value to compare against
    :return: True if the significant characters of source match target
    '''
    source_counts = _count_spec_chars(str(source).lower())
    target_counts = _count_spec_chars(str(target).lower())
    # A character missing from the target yields None from .get(), which
    # never equals a positive count, so missing keys are rejected here too.
    return all(
        target_counts.get(ch) == count
        for ch, count in source_counts.items()
    )
- import json
- import requests
# Shared HTTP session reused by every embedding request.
session = requests.Session()


def request_embedding(sentence,retry_times=3):
    '''
    Request the embedding vector of a sentence from the embedding service.

    POSTs ``{"sentence": sentence}`` as JSON to ``embedding_url`` (not defined
    in the visible part of this file; presumably supplied by the wildcard
    import of product_setting -- confirm). An attempt succeeds when the
    response status is 200 and the decoded JSON body has a truthy "success"
    field; the "vector" field of that body is then returned.

    Transport errors and undecodable bodies are treated as failed attempts and
    retried, instead of aborting the whole retry loop on the first exception.
    If the final attempt also raises, that exception is propagated.

    :param sentence: the text to embed
    :param retry_times: maximum number of attempts
    :return: the "vector" value of the response, or None if no attempt succeeded
    '''
    for attempt in range(retry_times):
        try:
            resp = session.post(embedding_url,json={"sentence":sentence})
            if resp.status_code==200:
                content = resp.content.decode("utf-8")
                _d = json.loads(content)
                if _d.get("success"):
                    return _d.get("vector")
        except (requests.RequestException, ValueError):
            # ValueError covers both invalid UTF-8 and invalid JSON.
            # Only give up with the error once all attempts are used.
            if attempt == retry_times - 1:
                raise
    return None
def clean_product_name(product_name):
    """
    Clean a product name before it is inserted.

    No cleaning rule is applied at present: the value is handed back exactly
    as it was received.

    :param product_name: the raw product name
    :return: the same ``product_name`` object, unchanged
    """
    return product_name
def clean_product_brand(product_brand):
    """
    Clean a product brand before it is inserted.

    No cleaning rule is applied at present: the value is handed back exactly
    as it was received.

    :param product_brand: the raw product brand
    :return: the same ``product_brand`` object, unchanged
    """
    return product_brand
# Matches every character that is NOT an ASCII letter, a digit, '-', '/',
# or an ASCII / full-width parenthesis.
SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()()]")


def clean_product_specs(product_specs):
    '''
    Clean product specs before they are inserted.

    Removes every character matched by SPECS_PATTERN, keeping only ASCII
    letters, digits, '-', '/', and ASCII or full-width parentheses. If nothing
    is left after the removal, the original value is returned instead so that
    specs written entirely in other characters are not lost.

    ``None`` is returned unchanged rather than raising a TypeError, matching
    the None-tolerance of the other clean_* helpers in this module.

    :param product_specs: the raw specs string, or None
    :return: the cleaned specs, or the original value when cleaning would
             leave an empty string (or when the input is None)
    '''
    if product_specs is None:
        return product_specs
    _specs = SPECS_PATTERN.sub('', product_specs)
    if _specs:
        return _specs
    return product_specs
def clean_product_unit_price(product_unit_price):
    """
    Clean a unit price before it is inserted.

    Converts the value with ``float``. ``None``, the empty string, and any
    value whose conversion raises all produce the empty string instead.

    :param product_unit_price: the raw unit price
    :return: the price as a float, or "" when it is missing or not convertible
    """
    cleaned = ""
    try:
        # The emptiness test stays inside the try block so that an exotic
        # value whose comparison itself raises is also mapped to "".
        if product_unit_price is not None and product_unit_price != "":
            cleaned = float(product_unit_price)
    except Exception:
        cleaned = ""
    return cleaned
def clean_product_quantity(product_quantity):
    """
    Clean a product quantity.

    Converts the value with ``int``. ``None``, the empty string, and any value
    whose conversion raises (for example the string "3.5") all produce the
    empty string instead.

    :param product_quantity: the raw quantity
    :return: the quantity as an int, or "" when it is missing or not convertible
    """
    cleaned = ""
    try:
        # The emptiness test stays inside the try block so that an exotic
        # value whose comparison itself raises is also mapped to "".
        if product_quantity is not None and product_quantity != "":
            cleaned = int(product_quantity)
    except Exception:
        cleaned = ""
    return cleaned
if __name__ == '__main__':
    # Manual smoke checks, run only when the module is executed directly.
    cleaned_specs = clean_product_specs("XY-K-JLJ-3A")
    print(cleaned_specs)
    specs_match = check_specs("佳士比F6",'佳士比”F6')
    print(specs_match)
|