import json
import re
import traceback

import jieba
import Levenshtein
import redis
import requests
from collections import Counter
from fuzzywuzzy import fuzz
from tablestore import *

from BaseDataMaintenance.maintenance.product.product_setting import *
from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
from BaseDataMaintenance.dataSource.source import getConnect_redis_product_pool
from BaseDataMaintenance.dataSource.pool import ConnectorPool
from BaseDataMaintenance.common.Utils import log
from BaseDataMaintenance.common.documentFingerprint import getMD5
from BaseDataMaintenance.common.milvusUtil import search_embedding
from BaseDataMaintenance.model.ots.document_product_dict_interface import *
from BaseDataMaintenance.model.ots.document_product_dict import *
from BaseDataMaintenance.model.ots.document_product_tmp import *
from BaseDataMaintenance.model.ots.enterprise import *

# Shared module-level resources. getConnect_ots, embedding_url,
# get_milvus_standard_name and the table/grade constants are provided
# by the wildcard imports above.
pool_product = getConnect_redis_product_pool()
area_set = get_area_set()
ots_client = getConnect_ots()
session = requests.Session()
def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
    '''
    embed the whole name plus jieba n-gram windows of it,
    then search milvus with all the vectors at once
    '''
    vector = []
    v = get_embedding_request(name)
    if v is not None:
        vector.append(v)
    if len(str(name))>=5:
        name_cut = list(jieba.cut(name))
        strides = [1,2]
        for stride in strides:
            steps = len(name_cut)//stride
            # ceiling division: count a trailing partial window as one more step
            if len(name_cut)%stride>0:
                steps += 1
            _name = ""
            for i in range(min(steps,max_steps)):
                _name += "".join(name_cut[i*stride:(i+1)*stride])
                # accumulate tokens until the window text is at least 2 chars
                if len(_name)<2:
                    continue
                v = get_embedding_request(_name)
                if v is not None:
                    vector.append(v)
                _name = ""
    if len(vector)>0:
        list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
        if list_search:
            return list_search
    return []
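# Illustrative trace (hypothetical segmentation): assuming jieba splits "医用冷藏箱"
# into ["医用","冷藏","箱"], the stride-1 pass embeds "医用" and "冷藏" (the
# single-char tail "箱" never reaches 2 chars), the stride-2 pass embeds "医用冷藏",
# and the whole name is embedded as well; all vectors go to milvus in one search.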
def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit=3):
    if name is None or name=="":
        return None
    db = redis.Redis(connection_pool=pool_product)
    try:
        # results are cached in redis, keyed by md5(name) and the dict grade
        _md5 = getMD5(str(name))+"_milvus_%d"%(grade)
        _search_list = None
        try:
            _search_list = db.get(_md5)
        except Exception as e:
            log("get redis data error")
        if _search_list is not None:
            return json.loads(_search_list)
        else:
            list_result = []
            result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
            for hits in result:
                for hit in hits:
                    list_result.append(hit)
            final_list = []
            for _search in list_result:
                _d = {}
                for k in output_fields:
                    _d[k] = _search.entity.get(k)
                final_list.append(_d)
            final_list = remove_repeat_item(final_list,k="ots_name")
            for _d in final_list:
                # character-overlap ratio between candidate and query,
                # used as a secondary ranking signal (higher is closer)
                standard_set = set(_d.get("standard_name",""))
                name_set = set(name)
                _d["length_dis"] = len(standard_set&name_set)/max(len(standard_set)+len(name_set),1)
            # sort by overlap desc first, then stable-sort by dict level asc
            final_list.sort(key=lambda x:x.get("length_dis",0),reverse=True)
            final_list.sort(key=lambda x:x.get("level",1))
            try:
                db.set(_md5,json.dumps(final_list))
                db.expire(_md5,PRODUCT_REDIS_CACHE_TIME)
            except Exception as e:
                traceback.print_exc()
                log("set redis data error")
            return final_list
    except Exception as e:
        traceback.print_exc()
        raise RuntimeError("get milvus search error")
def remove_repeat_item(list_result,k="standard_name"):
    # de-duplicate by key k, keeping the first occurrence of each value
    final_list = []
    set_k = set()
    for item in list_result:
        _v = item.get(k)
        if _v is not None and _v in set_k:
            continue
        final_list.append(item)
        set_k.add(_v)
    return final_list
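# e.g. remove_repeat_item([{"a":1},{"a":1},{"a":2}],k="a") keeps the first {"a":1}
# and {"a":2}; items whose key is missing (None) are always kept.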
def get_embedding_request(sentence,retry_times=3):
    '''
    look up the sentence embedding in the redis cache first, fall back to
    the embedding service and cache the result for an hour
    '''
    if sentence is None or sentence=="":
        return None
    db = redis.Redis(connection_pool=pool_product)
    try:
        _md5 = getMD5(get_milvus_standard_name(sentence))+"_embedding"
        _embedding = None
        try:
            _embedding = db.get(_md5)
        except Exception as e:
            log("get redis data error")
        if _embedding is not None:
            return json.loads(_embedding)
        else:
            _embedding = request_embedding(sentence,retry_times=retry_times)
            if _embedding is not None:
                try:
                    db.set(_md5,json.dumps(_embedding))
                    db.expire(_md5,60*60)
                except Exception as e:
                    traceback.print_exc()
                    log("set redis data error")
            return _embedding
    except Exception as e:
        traceback.print_exc()
        raise RuntimeError("get embedding request error")
def judge_pur_chinese(keyword):
    """
    Chinese characters fall in the range u'\u4e00'..u'\u9fff'; after stripping
    punctuation, return True only if every remaining character is in that range.
    @param keyword:
    @return: True if the keyword is pure Chinese, otherwise False
    """
    # punctuation characters to strip before the check
    remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
    strings = re.sub(remove_chars, "", keyword)
    for ch in strings:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return True
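# e.g. judge_pur_chinese("医用冷藏箱") -> True, judge_pur_chinese("CT机") -> False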
def get_chinese_string(string):
    # extract the maximal pure-Chinese substrings of length >= 2
    list_s = []
    for s in re.split("[^\u4e00-\u9fff]",string):
        if len(s)>=2:
            list_s.append(s)
    return list_s
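# e.g. get_chinese_string("128排RevolutionCT彩色多普勒") -> ["彩色多普勒"]
# ("排" is dropped because single-character segments are ignored)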
def is_area_brand(brand,area_set):
    '''
    classify a brand against the set of administrative area names:
    2 - the brand is an area name or the concatenation of two area names
    1 - the brand is an area name followed by a 2-5 char remainder
    0 - anything else
    '''
    brand = re.sub("[省市区县等]","",brand)
    if len(brand)>12:
        return 0
    if brand in area_set:
        return 2
    for _i in range(2,len(brand)):
        ns = brand[:_i]
        ns1 = brand[_i:]
        if ns in area_set and (ns1 in area_set or ns1==""):
            return 2
        if ns in area_set and 2<=len(brand)-_i<=5:
            return 1
    return 0
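# e.g. (assuming "北京" and "上海" are both in area_set)
#   is_area_brand("北京上海",area_set) -> 2   # two area names back to back
#   is_area_brand("北京格力",area_set) -> 1   # area prefix + short remainder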
def jaccard_score(source,target):
    # containment ratio of the two character sets: the intersection over the
    # smaller set (via max of the two directions), not a strict jaccard index
    source_set = set(source)
    target_set = set(target)
    if len(source_set)==0 or len(target_set)==0:
        return 0
    return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
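# e.g. jaccard_score("abc","bcd") -> 0.666..., and jaccard_score("ab","abcd") -> 1.0
# because "ab" is fully contained in "abcd"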
def is_similar(source,target,_ratio=None):
    source = str(source).lower()
    target = str(target).lower()
    max_len = max(len(source),len(target))
    min_len = min(len(source),len(target))
    # looser threshold for longer strings unless the caller supplies one
    min_ratio = 90
    if min_len>=3:
        min_ratio = 87
    if min_len>=5:
        min_ratio = 85
    if _ratio is not None:
        min_ratio = _ratio
    if min_len==0 and max_len>0:
        return False
    if max_len<=2:
        if source==target:
            return True
    if min_len<2:
        return False
    # edit-distance style similarity checks
    similar = fuzz.ratio(source,target)
    if similar>=min_ratio:
        log("%s and %s similar_ratio %d"%(source,target,similar))
        return True
    similar_jaro = Levenshtein.jaro(source,target)
    if similar_jaro*100>=min_ratio:
        log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
        return True
    similar_jarow = Levenshtein.jaro_winkler(source,target)
    if similar_jarow*100>=min_ratio:
        log("%s and %s similar_jaro_winkler %d"%(source,target,similar_jarow*100))
        return True
    if min_len>=5:
        # containment or full character overlap counts for longer strings
        if len(source)==max_len and source.find(target)>=0:
            return True
        elif len(target)==max_len and target.find(source)>=0:
            return True
        elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
            return True
    return False
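# e.g. is_similar("助听器","助行器") -> False: fuzz.ratio is 67 and jaro_winkler
# is about 80, both under the min_len==3 threshold of 87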
def is_contain(source,target,min_len=2):
    # True when one string contains the other and the shorter side has at least min_len chars
    if len(source)>=len(target) and target in source and len(target)>=min_len:
        return True
    if len(target)>len(source) and source in target and len(source)>=min_len:
        return True
    return False
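# e.g. is_contain("医用冷藏箱","冷藏箱") -> True, is_contain("医用冷藏箱","箱") -> False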
def check_char(source,target,char_pattern=re.compile("^[a-zA-Z0-9\-]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9-]+)")):
    '''
    three-valued check on the alphanumeric tokens of the two strings:
    returns True/False when at least one side is a pure alphanumeric code
    (they must share a token), and None when neither side is, so that the
    caller can fall through to other checks
    '''
    if re.search(char_pattern,source) is not None or re.search(char_pattern,target) is not None:
        a = set(re.findall(find_pattern,source))
        b = set(re.findall(find_pattern,target))
        if len(a&b)>0:
            return True
        else:
            return False
    return None
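# e.g. check_char("rtx4090(盒装)","rtx4090") -> True (shared token "rtx4090"),
#   check_char("dyw-jy-t01-a1(定制)","jy") -> False (tokens keep their hyphens,
#   so "dyw-jy-t01-a1" != "jy"), check_char("医用冷藏箱","冷藏箱") -> None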
def check_product(source,target,remove_words):
    # a hit on any of the remove_words vetoes the match outright
    if remove_words is not None and remove_words!="":
        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
        list_split = [a.strip() for a in _split if a.strip()!=""]
        for _s in list_split:
            if str(source).find(_s)>=0:
                return False
    # alphanumeric codes decide on their own; None means fall through
    _check = check_char(source,target)
    if _check is True:
        return True
    if _check is False:
        return False
    if len(source)>len(target) and target in source:
        return True
    min_len = min(len(source),len(target))
    if min_len<2:
        return False
    # fixed high threshold for product names (supersedes earlier length-based values)
    min_ratio = 98
    if is_similar(source,target,min_ratio):
        return True
    return False
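# e.g. check_product("医用冷藏箱","冷藏箱","") -> True by containment; passing
# remove_words that hit the source (e.g. "冷藏") would veto the match first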
def check_brand(source,target,remove_words):
    # drop administrative/company suffixes before comparing
    source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
    target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
    if remove_words is not None and remove_words!="":
        _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
        list_split = [a.strip() for a in _split if a.strip()!=""]
        for _s in list_split:
            if str(source).find(_s)>=0:
                return False
    min_len = min(len(source),len(target))
    if min_len<2:
        return False
    # fixed high threshold for brands (supersedes earlier length-based values)
    min_ratio = 98
    source_c = "".join(get_chinese_string(source))
    target_c = "".join(get_chinese_string(target))
    # alphanumeric codes decide on their own; None means fall through
    _check = check_char(source,target)
    if _check is True:
        return True
    if _check is False:
        return False
    if len(source_c)>=2 and len(target_c)>=2:
        # never match when either Chinese core is itself an area name
        if not(source_c in area_set or target_c in area_set):
            if is_contain(source_c,target_c):
                return True
            if is_similar(source_c,target_c,min_ratio):
                return True
        else:
            return False
    if has_same_specs_count(source,target):
        if is_contain(source,target):
            return True
        if is_similar(source,target,min_ratio):
            return True
    return False
# characters that count as "spec" characters: latin letters, digits, dot, roman numerals
SPECS_CHECK_SET = set('abcdefghijklmnopqrstuvwxyz') | set('0123456789.') | set('IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ')
NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(SPECS_CHECK_SET)))
def has_same_specs_count(source, target):
    # compare only the multiset (type and count) of spec characters; order is ignored
    source_counter = Counter(s for s in str(source).lower() if s in SPECS_CHECK_SET)
    target_counter = Counter(s for s in str(target).lower() if s in SPECS_CHECK_SET)
    return source_counter == target_counter
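# e.g. has_same_specs_count("500ML","ml500") -> True (same characters, any order),
#   has_same_specs_count("500ml","50ml") -> False (one "0" fewer)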
def is_legal_brand(ots_client,brand):
    # if the text carries a "品牌:" (brand:) label, extract the labelled value first
    _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",brand)
    if _search is not None:
        brand = _search.groupdict().get("brand")
    if brand is None or len(brand)<2:
        return False
    # reject brands that were already deleted through the interface table
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,brand),
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,BRAND_GRADE),
        TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,"delete")
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>0:
        return False
    # reject brands that already exist in the dict as a name or a specs entry
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_DICT_NAME,brand),
        BoolQuery(should_queries=[
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,NAME_GRADE),
            TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE)
        ])
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>0:
        return False
    # area+remainder combinations are legal, pure area names are not
    _f = is_area_brand(brand,area_set)
    if _f==1:
        log("%s is_legal_brand True by is_area_brand"%(brand))
        return True
    elif _f==2:
        return False
    # company names registered as enterprises count as legal brands
    if len(brand)<100 and len(brand)>=8:
        _d = {ENTERPRISE_NAME:brand}
        _ent = Enterprise(_d)
        if _ent.exists_row(ots_client):
            _ent.fix_columns(ots_client,[ENTERPRISE_bid_number,ENTERPRISE_STATUS,ENTERPRISE_tyc_id],True)
            if _ent.getProperties().get(ENTERPRISE_STATUS,0)>=201 and _ent.getProperties().get(ENTERPRISE_STATUS,0)<=300:
                if _ent.getProperties().get(ENTERPRISE_bid_number,0)>0 or _ent.getProperties().get(ENTERPRISE_tyc_id,0):
                    log("%s is_legal_brand True by Enterprise"%(brand))
                    return True
    # frequent, short, non-placeholder brands in the tmp table count as legal
    bool_query = BoolQuery(must_queries=[
        TermQuery(DOCUMENT_PRODUCT_TMP_BRAND,brand)
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
                                                                   SearchQuery(bool_query,get_total_count=True))
    if total_count>=5:
        if re.search("详见|无|国产|null|其他|详细|废标|[0-9/]|品牌|文件|^见",brand) is None and len(brand)<=8:
            log("%s is_legal_brand True by count"%(brand))
            return True
    return False
SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
def is_legal_specs(specs):
    # a legal specs string is non-empty and contains only spec characters
    if specs is None or specs=="":
        return False
    specs = str(specs).lower()
    if re.search(SPECS_PATTERN,specs) is not None:
        return False
    return True
def check_specs(source,target):
    '''
    check if the source specs is the same as the target:
    same only if the chars in SPECS_CHECK_SET have the same counts and
    the two strings share a long enough common suffix
    :param source:
    :param target:
    :return:
    '''
    source = str(source).lower()
    target = str(target).lower()
    source = re.sub(NOT_SPECS_PATTERN,'',source)
    target = re.sub(NOT_SPECS_PATTERN,'',target)
    if source==target and len(source)>0:
        return True
    if has_same_specs_count(source,target):
        # walk backwards from the end; accept if the first mismatch (if any)
        # occurs deeper than half of the shorter string
        _index = 0
        for _i in range(min(len(source),len(target))):
            _index = -(_i+1)
            if source[_index]!=target[_index]:
                break
        if abs(_index)>min(len(source),len(target))//2:
            return True
    return False
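# e.g. check_specs("ABC-123","abc123") -> True (hyphens are not spec characters),
#   check_specs("500ml","3500ml") -> False (the "3" has no counterpart)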
def request_embedding(sentence,retry_times=3):
    # call the embedding service; a failed attempt consumes one retry
    sentence = get_milvus_standard_name(sentence)
    for _ in range(retry_times):
        try:
            resp = session.post(embedding_url,json={"sentence":sentence})
        except Exception as e:
            traceback.print_exc()
            continue
        if resp.status_code==200:
            content = resp.content.decode("utf-8")
            _d = json.loads(content)
            if _d.get("success"):
                return _d.get("vector")
    return None
def clean_product_name(product_name):
    '''
    clean before insert
    :param product_name:
    :return:
    '''
    return product_name
def clean_product_brand(product_brand):
    '''
    clean before insert
    :param product_brand:
    :return:
    '''
    # prefer the value behind an explicit "品牌:" (brand:) label
    _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",product_brand)
    if _search is not None:
        product_brand = _search.groupdict().get("brand")
    brand = re.sub("[/\\,,、.|等]|一批|/无|品牌|^[/.]+",'',product_brand)
    # strip a leading area name when the remainder is a known brand in the interface dict
    for i in range(min(len(brand)-2,8)):
        _n = brand[:i+1]
        if _n in area_set:
            n_name = re.sub("^[省市区]",'',brand[i+1:])
            face_id = get_document_product_dict_interface_base_id(n_name,BRAND_GRADE)
            _interface_d = {
                DOCUMENT_PRODUCT_DICT_INTERFACE_ID:face_id
            }
            _dpdi = Document_product_dict_interface(_interface_d)
            if _dpdi.exists_row(ots_client):
                brand = n_name
                break
    return brand
def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")):
    '''
    clean before insert: drop characters that cannot appear in a specs string
    as well as leading separators; fall back to the input if nothing is left
    :param product_specs:
    :return:
    '''
    _specs = re.sub(_PATTERN,'',product_specs)
    if len(_specs)>0:
        return _specs
    return product_specs
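# e.g. clean_product_specs("//4008SverssionV10") -> "4008SverssionV10"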
def clean_product_unit_price(product_unit_price):
    '''
    clean before insert: return the price as a float, or "" if it cannot be parsed
    :param product_unit_price:
    :return:
    '''
    try:
        if product_unit_price is not None and product_unit_price!="":
            return float(product_unit_price)
    except Exception as e:
        return ""
    return ""

def clean_product_quantity(product_quantity):
    '''
    clean before insert: return the quantity as an int, or "" if it cannot be parsed
    :param product_quantity:
    :return:
    '''
    try:
        if product_quantity is not None and product_quantity!="":
            return int(product_quantity)
    except Exception as e:
        return ""
    return ""
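# e.g. clean_product_unit_price("12.50") -> 12.5, clean_product_unit_price("面议") -> ""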
if __name__ == '__main__':
    # print(check_brand('DYW-JY-T01-A1(定制)','JY',''))
    # print(check_product("医用冷藏箱","医用","a|"))
    # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
    # print(Levenshtein.ratio('助听器','助行器'))
    # print(clean_product_specs("//4008SverssionV10"))
    print(is_legal_brand(getConnect_ots(),"产地:中国品牌:天津迈达型号:ODM-2100S"))
    print(clean_product_brand("产地:中国品牌:天津迈达型号:ODM-2100S"))
    # print(check_specs("500ml","3500ml"))
    # print(is_similar("手术显微镜配套无线工作站(含助手镜)","显微镜",80))