import json
import re
import traceback

import jieba
import Levenshtein
import redis
import requests
from collections import Counter
from fuzzywuzzy import fuzz
from tablestore import *

from BaseDataMaintenance.maintenance.product.product_setting import *
from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
from BaseDataMaintenance.dataSource.source import getConnect_redis_product_pool
from BaseDataMaintenance.dataSource.pool import ConnectorPool
from BaseDataMaintenance.common.Utils import log
from BaseDataMaintenance.common.documentFingerprint import getMD5
from BaseDataMaintenance.common.milvusUtil import search_embedding
from BaseDataMaintenance.model.ots.document_product_dict_interface import *
from BaseDataMaintenance.model.ots.document_product_dict import *
from BaseDataMaintenance.model.ots.document_product_tmp import *
from BaseDataMaintenance.model.ots.enterprise import *

# Shared module-level resources. getConnect_ots, embedding_url,
# get_milvus_standard_name and the table/grade constants are provided
# by the wildcard imports above.
pool_product = getConnect_redis_product_pool()
area_set = get_area_set()
ots_client = getConnect_ots()
session = requests.Session()
def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
    '''
    embed the whole name plus jieba n-gram windows of it,
    then search milvus with all the vectors at once
    '''
    vector = []
    v = get_embedding_request(name)
    if v is not None:
        vector.append(v)
    if len(str(name))>=5:
        name_cut = list(jieba.cut(name))
        strides = [1,2]
        for stride in strides:
            steps = len(name_cut)//stride
            # ceiling division: count a trailing partial window as one more step
            if len(name_cut)%stride>0:
                steps += 1
            _name = ""
            for i in range(min(steps,max_steps)):
                _name += "".join(name_cut[i*stride:(i+1)*stride])
                # accumulate tokens until the window text is at least 2 chars
                if len(_name)<2:
                    continue
                v = get_embedding_request(_name)
                if v is not None:
                    vector.append(v)
                _name = ""
    if len(vector)>0:
        list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
        if list_search:
            return list_search
    return []
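# Illustrative trace (hypothetical segmentation): assuming jieba splits "医用冷藏箱"
# into ["医用","冷藏","箱"], the stride-1 pass embeds "医用" and "冷藏" (the
# single-char tail "箱" never reaches 2 chars), the stride-2 pass embeds "医用冷藏",
# and the whole name is embedded as well; all vectors go to milvus in one search.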
def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit=3):
    if name is None or name=="":
        return None
    db = redis.Redis(connection_pool=pool_product)
    try:
        # results are cached in redis, keyed by md5(name) and the dict grade
        _md5 = getMD5(str(name))+"_milvus_%d"%(grade)
        _search_list = None
        try:
            _search_list = db.get(_md5)
        except Exception as e:
            log("get redis data error")
        if _search_list is not None:
            return json.loads(_search_list)
        else:
            list_result = []
            result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
            for hits in result:
                for hit in hits:
                    list_result.append(hit)
            final_list = []
            for _search in list_result:
                _d = {}
                for k in output_fields:
                    _d[k] = _search.entity.get(k)
                final_list.append(_d)
            final_list = remove_repeat_item(final_list,k="ots_name")
            for _d in final_list:
                # character-overlap ratio between candidate and query,
                # used as a secondary ranking signal (higher is closer)
                standard_set = set(_d.get("standard_name",""))
                name_set = set(name)
                _d["length_dis"] = len(standard_set&name_set)/max(len(standard_set)+len(name_set),1)
            # sort by overlap desc first, then stable-sort by dict level asc
            final_list.sort(key=lambda x:x.get("length_dis",0),reverse=True)
            final_list.sort(key=lambda x:x.get("level",1))
            try:
                db.set(_md5,json.dumps(final_list))
                db.expire(_md5,PRODUCT_REDIS_CACHE_TIME)
            except Exception as e:
                traceback.print_exc()
                log("set redis data error")
            return final_list
    except Exception as e:
        traceback.print_exc()
        raise RuntimeError("get milvus search error")
def remove_repeat_item(list_result,k="standard_name"):
    # de-duplicate by key k, keeping the first occurrence of each value
    final_list = []
    set_k = set()
    for item in list_result:
        _v = item.get(k)
        if _v is not None and _v in set_k:
            continue
        final_list.append(item)
        set_k.add(_v)
    return final_list
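# e.g. remove_repeat_item([{"a":1},{"a":1},{"a":2}],k="a") keeps the first {"a":1}
# and {"a":2}; items whose key is missing (None) are always kept.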
def get_embedding_request(sentence,retry_times=3):
    '''
    look up the sentence embedding in the redis cache first, fall back to
    the embedding service and cache the result for an hour
    '''
    if sentence is None or sentence=="":
        return None
    db = redis.Redis(connection_pool=pool_product)
    try:
        _md5 = getMD5(get_milvus_standard_name(sentence))+"_embedding"
        _embedding = None
        try:
            _embedding = db.get(_md5)
        except Exception as e:
            log("get redis data error")
        if _embedding is not None:
            return json.loads(_embedding)
        else:
            _embedding = request_embedding(sentence,retry_times=retry_times)
            if _embedding is not None:
                try:
                    db.set(_md5,json.dumps(_embedding))
                    db.expire(_md5,60*60)
                except Exception as e:
                    traceback.print_exc()
                    log("set redis data error")
            return _embedding
    except Exception as e:
        traceback.print_exc()
        raise RuntimeError("get embedding request error")
def judge_pur_chinese(keyword):
    """
    Chinese characters fall in the range u'\u4e00'..u'\u9fff'; after stripping
    punctuation, return True only if every remaining character is in that range.
    @param keyword:
    @return: True if the keyword is pure Chinese, otherwise False
    """
    # punctuation characters to strip before the check
    remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
    strings = re.sub(remove_chars, "", keyword)
    for ch in strings:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True
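# e.g. judge_pur_chinese("医用冷藏箱") -> True, judge_pur_chinese("CT机") -> False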
def get_chinese_string(string):
    # extract the maximal pure-Chinese substrings of length >= 2
    list_s = []
    for s in re.split("[^\u4e00-\u9fff]",string):
        if len(s)>=2:
            list_s.append(s)
    return list_s
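# e.g. get_chinese_string("128排RevolutionCT彩色多普勒") -> ["彩色多普勒"]
# ("排" is dropped because single-character segments are ignored)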
def is_area_brand(brand,area_set):
    '''
    classify a brand against the set of administrative area names:
    2 - the brand is an area name or the concatenation of two area names
    1 - the brand is an area name followed by a 2-5 char remainder
    0 - anything else
    '''
    brand = re.sub("[省市区县等]","",brand)
    if len(brand)>12:
        return 0
    if brand in area_set:
        return 2
    for _i in range(2,len(brand)):
        ns = brand[:_i]
        ns1 = brand[_i:]
        if ns in area_set and (ns1 in area_set or ns1==""):
            return 2
        if ns in area_set and 2<=len(brand)-_i<=5:
            return 1
    return 0
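# e.g. (assuming "北京" and "上海" are both in area_set)
#   is_area_brand("北京上海",area_set) -> 2   # two area names back to back
#   is_area_brand("北京格力",area_set) -> 1   # area prefix + short remainder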
def jaccard_score(source,target):
    # containment ratio of the two character sets: the intersection over the
    # smaller set (via max of the two directions), not a strict jaccard index
    source_set = set(source)
    target_set = set(target)
    if len(source_set)==0 or len(target_set)==0:
        return 0
    return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
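# e.g. jaccard_score("abc","bcd") -> 0.666..., and jaccard_score("ab","abcd") -> 1.0
# because "ab" is fully contained in "abcd"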
def is_similar(source,target,_ratio=None):
    source = str(source).lower()
    target = str(target).lower()
    max_len = max(len(source),len(target))
    min_len = min(len(source),len(target))
    # looser threshold for longer strings unless the caller supplies one
    min_ratio = 90
    if min_len>=3:
        min_ratio = 87
    if min_len>=5:
        min_ratio = 85
    if _ratio is not None:
        min_ratio = _ratio
    if min_len==0 and max_len>0:
        return False
    if max_len<=2:
        if source==target:
            return True
    if min_len<2:
        return False
    # edit-distance style similarity checks
    similar = fuzz.ratio(source,target)
    if similar>=min_ratio:
        log("%s and %s similar_ratio %d"%(source,target,similar))
        return True
    similar_jaro = Levenshtein.jaro(source,target)
    if similar_jaro*100>=min_ratio:
        log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
        return True
    similar_jarow = Levenshtein.jaro_winkler(source,target)
    if similar_jarow*100>=min_ratio:
        log("%s and %s similar_jaro_winkler %d"%(source,target,similar_jarow*100))
        return True
    if min_len>=5:
        # containment or full character overlap counts for longer strings
        if len(source)==max_len and source.find(target)>=0:
            return True
        elif len(target)==max_len and target.find(source)>=0:
            return True
        elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
            return True
    return False
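# e.g. is_similar("助听器","助行器") -> False: fuzz.ratio is 67 and jaro_winkler
# is about 80, both under the min_len==3 threshold of 87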
def is_contain(source,target,min_len=2):
    # True when one string contains the other and the shorter side has at least min_len chars
    if len(source)>=len(target) and target in source and len(target)>=min_len:
        return True
    if len(target)>len(source) and source in target and len(source)>=min_len:
        return True
    return False
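# e.g. is_contain("医用冷藏箱","冷藏箱") -> True, is_contain("医用冷藏箱","箱") -> False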
def check_char(source,target,char_pattern=re.compile("^[a-zA-Z0-9\-]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9-]+)")):
    '''
    three-valued check on the alphanumeric tokens of the two strings:
    returns True/False when at least one side is a pure alphanumeric code
    (they must share a token), and None when neither side is, so that the
    caller can fall through to other checks
    '''
    if re.search(char_pattern,source) is not None or re.search(char_pattern,target) is not None:
        a = set(re.findall(find_pattern,source))
        b = set(re.findall(find_pattern,target))
        if len(a&b)>0:
            return True
        else:
            return False
    return None
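# e.g. check_char("rtx4090(盒装)","rtx4090") -> True (shared token "rtx4090"),
#   check_char("dyw-jy-t01-a1(定制)","jy") -> False (tokens keep their hyphens,
#   so "dyw-jy-t01-a1" != "jy"), check_char("医用冷藏箱","冷藏箱") -> None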
def check_product(source,target,remove_words):
    # a hit on any of the remove_words vetoes the match outright
    if remove_words is not None and remove_words!="":
        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
        list_split = [a.strip() for a in _split if a.strip()!=""]
        for _s in list_split:
            if str(source).find(_s)>=0:
                return False
    # alphanumeric codes decide on their own; None means fall through
    _check = check_char(source,target)
    if _check is True:
        return True
    if _check is False:
        return False
    if len(source)>len(target) and target in source:
        return True
    min_len = min(len(source),len(target))
    if min_len<2:
        return False
    # fixed high threshold for product names (supersedes earlier length-based values)
    min_ratio = 98
    if is_similar(source,target,min_ratio):
        return True
    return False
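# e.g. check_product("医用冷藏箱","冷藏箱","") -> True by containment; passing
# remove_words that hit the source (e.g. "冷藏") would veto the match first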
def check_brand(source,target,remove_words):
    # drop administrative/company suffixes before comparing
    source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
    target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
    if remove_words is not None and remove_words!="":
        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
        list_split = [a.strip() for a in _split if a.strip()!=""]
        for _s in list_split:
            if str(source).find(_s)>=0:
                return False
    min_len = min(len(source),len(target))
    if min_len<2:
        return False
    # fixed high threshold for brands (supersedes earlier length-based values)
    min_ratio = 98
    source_c = "".join(get_chinese_string(source))
    target_c = "".join(get_chinese_string(target))
    # alphanumeric codes decide on their own; None means fall through
    _check = check_char(source,target)
    if _check is True:
        return True
    if _check is False:
        return False
    if len(source_c)>=2 and len(target_c)>=2:
        # never match when either Chinese core is itself an area name
        if not(source_c in area_set or target_c in area_set):
            if is_contain(source_c,target_c):
                return True
            if is_similar(source_c,target_c,min_ratio):
                return True
        else:
            return False
    if has_same_specs_count(source,target):
        if is_contain(source,target):
            return True
        if is_similar(source,target,min_ratio):
            return True
    return False
# characters that count as "spec" characters: latin letters, digits, dot, roman numerals
SPECS_CHECK_SET = set('abcdefghijklmnopqrstuvwxyz') | set('0123456789.') | set('IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ')
NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(SPECS_CHECK_SET)))
def has_same_specs_count(source, target):
    # compare only the multiset (type and count) of spec characters; order is ignored
    source_counter = Counter(s for s in str(source).lower() if s in SPECS_CHECK_SET)
    target_counter = Counter(s for s in str(target).lower() if s in SPECS_CHECK_SET)
    return source_counter == target_counter
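# e.g. has_same_specs_count("500ML","ml500") -> True (same characters, any order),
#   has_same_specs_count("500ml","50ml") -> False (one "0" fewer)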
def is_legal_brand(ots_client,brand):
    # if the text carries a "品牌:" (brand:) label, extract the labelled value first
    _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",brand)
    if _search is not None:
        brand = _search.groupdict().get("brand")
    if brand is None or len(brand)<2:
        return False
    # reject brands that were already deleted through the interface table
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,brand),
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,BRAND_GRADE),
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,"delete")
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>0:
        return False
    # reject brands that already exist in the dict as a name or a specs entry
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_DICT_NAME,brand),
        BoolQuery(should_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,NAME_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE)
        ])
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>0:
        return False
    # area+remainder combinations are legal, pure area names are not
    _f = is_area_brand(brand,area_set)
    if _f==1:
        log("%s is_legal_brand True by is_area_brand"%(brand))
        return True
    elif _f==2:
        return False
    # company names registered as enterprises count as legal brands
    if len(brand)<100 and len(brand)>=8:
        _d = {ENTERPRISE_NAME:brand}
        _ent = Enterprise(_d)
        if _ent.exists_row(ots_client):
            _ent.fix_columns(ots_client,[ENTERPRISE_bid_number,ENTERPRISE_STATUS,ENTERPRISE_tyc_id],True)
            if _ent.getProperties().get(ENTERPRISE_STATUS,0)>=201 and _ent.getProperties().get(ENTERPRISE_STATUS,0)<=300:
                if _ent.getProperties().get(ENTERPRISE_bid_number,0)>0 or _ent.getProperties().get(ENTERPRISE_tyc_id,0):
                    log("%s is_legal_brand True by Enterprise"%(brand))
                    return True
    # frequent, short, non-placeholder brands in the tmp table count as legal
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_TMP_BRAND,brand)
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>=5:
        if re.search("详见|无|国产|null|其他|详细|废标|[0-9/]|品牌|文件|^见",brand) is None and len(brand)<=8:
            log("%s is_legal_brand True by count"%(brand))
            return True
    return False
SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
def is_legal_specs(specs):
    # a legal specs string is non-empty and contains only spec characters
    if specs is None or specs=="":
        return False
    specs = str(specs).lower()
    if re.search(SPECS_PATTERN,specs) is not None:
        return False
    return True
def check_specs(source,target):
    '''
    check if the source specs is the same as the target:
    same only if the chars in SPECS_CHECK_SET have the same counts and
    the two strings share a long enough common suffix
    :param source:
    :param target:
    :return:
    '''
    source = str(source).lower()
    target = str(target).lower()
    source = re.sub(NOT_SPECS_PATTERN,'',source)
    target = re.sub(NOT_SPECS_PATTERN,'',target)
    if source==target and len(source)>0:
        return True
    if has_same_specs_count(source,target):
        # walk backwards from the end; accept if the first mismatch (if any)
        # occurs deeper than half of the shorter string
        _index = 0
        for _i in range(min(len(source),len(target))):
            _index = -(_i+1)
            if source[_index]!=target[_index]:
                break
        if abs(_index)>min(len(source),len(target))//2:
            return True
    return False
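# e.g. check_specs("ABC-123","abc123") -> True (hyphens are not spec characters),
#   check_specs("500ml","3500ml") -> False (the "3" has no counterpart)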
def request_embedding(sentence,retry_times=3):
    # call the embedding service; a failed attempt consumes one retry
    sentence = get_milvus_standard_name(sentence)
    for _ in range(retry_times):
        try:
            resp = session.post(embedding_url,json={"sentence":sentence})
        except Exception as e:
            traceback.print_exc()
            continue
        if resp.status_code==200:
            content = resp.content.decode("utf-8")
            _d = json.loads(content)
            if _d.get("success"):
                return _d.get("vector")
    return None
def clean_product_name(product_name):
    '''
    clean before insert
    :param product_name:
    :return:
    '''
    return product_name
def clean_product_brand(product_brand):
    '''
    clean before insert
    :param product_brand:
    :return:
    '''
    # prefer the value behind an explicit "品牌:" (brand:) label
    _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",product_brand)
    if _search is not None:
        product_brand = _search.groupdict().get("brand")
    brand = re.sub("[/\\,,、.|等]|一批|/无|品牌|^[/.]+",'',product_brand)
    # strip a leading area name when the remainder is a known brand in the interface dict
    for i in range(min(len(brand)-2,8)):
        _n = brand[:i+1]
        if _n in area_set:
            n_name = re.sub("^[省市区]",'',brand[i+1:])
            face_id = get_document_product_dict_interface_base_id(n_name,BRAND_GRADE)
            _interface_d = {
                DOCUMENT_PRODUCT_DICT_INTERFACE_ID:face_id
            }
            _dpdi = Document_product_dict_interface(_interface_d)
            if _dpdi.exists_row(ots_client):
                brand = n_name
                break
    return brand
def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")):
    '''
    clean before insert: drop characters that cannot appear in a specs string
    as well as leading separators; fall back to the input if nothing is left
    :param product_specs:
    :return:
    '''
    _specs = re.sub(_PATTERN,'',product_specs)
    if len(_specs)>0:
        return _specs
    return product_specs
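# e.g. clean_product_specs("//4008SverssionV10") -> "4008SverssionV10"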
def clean_product_unit_price(product_unit_price):
    '''
    clean before insert: return the price as a float, or "" if it cannot be parsed
    :param product_unit_price:
    :return:
    '''
    try:
        if product_unit_price is not None and product_unit_price!="":
            return float(product_unit_price)
    except Exception as e:
        return ""
    return ""

def clean_product_quantity(product_quantity):
    '''
    clean before insert: return the quantity as an int, or "" if it cannot be parsed
    :param product_quantity:
    :return:
    '''
    try:
        if product_quantity is not None and product_quantity!="":
            return int(product_quantity)
    except Exception as e:
        return ""
    return ""
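# e.g. clean_product_unit_price("12.50") -> 12.5, clean_product_unit_price("面议") -> ""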
if __name__ == '__main__':
    # print(check_brand('DYW-JY-T01-A1(定制)','JY',''))
    # print(check_product("医用冷藏箱","医用","a|"))
    # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
    # print(Levenshtein.ratio('助听器','助行器'))
    # print(clean_product_specs("//4008SverssionV10"))
    print(is_legal_brand(getConnect_ots(),"产地:中国品牌:天津迈达型号:ODM-2100S"))
    print(clean_product_brand("产地:中国品牌:天津迈达型号:ODM-2100S"))
    # print(check_specs("500ml","3500ml"))
    # print(is_similar("手术显微镜配套无线工作站(含助手镜)","显微镜",80))