productUtils.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. from BaseDataMaintenance.maintenance.product.product_setting import *
  2. import Levenshtein
  3. import re
  4. # 判断是不是入参字符串为全中文
  5. from BaseDataMaintenance.dataSource.source import getConnect_redis_product_pool
  6. from BaseDataMaintenance.dataSource.pool import ConnectorPool
  7. from BaseDataMaintenance.common.Utils import log
  8. from BaseDataMaintenance.common.documentFingerprint import getMD5
  9. from BaseDataMaintenance.common.milvusUtil import search_embedding
  10. import redis
  11. pool_product = getConnect_redis_product_pool()
  12. import traceback
  13. from tablestore import *
  14. from BaseDataMaintenance.model.ots.document_product_dict_interface import *
  15. from BaseDataMaintenance.model.ots.document_product_dict import *
  16. from BaseDataMaintenance.model.ots.document_product_tmp import *
  17. from BaseDataMaintenance.model.ots.enterprise import *
  18. from BaseDataMaintenance.maintenance.product.make_brand_pattern import get_area_set
  19. area_set = get_area_set()
  20. import jieba
  21. ots_client = getConnect_ots()
  22. def get_intellect_search(coll,index_name,name,grade,search_params,output_fields,limit,max_steps=5):
  23. vector = []
  24. v = get_embedding_request(name)
  25. if v is not None:
  26. vector.append(v)
  27. if len(str(name))>=5:
  28. name_cut = list(jieba.cut(name))
  29. strides = [1,2]
  30. for stride in strides:
  31. steps = len(name_cut)//stride
  32. if len(name)%stride>=stride//2+1:
  33. steps += 1
  34. _begin = 0
  35. _name = ""
  36. for i in range(min(steps,max_steps)):
  37. _name += "".join(name_cut[i*stride:(i+1)*stride])
  38. if len(_name)<2:
  39. continue
  40. v = get_embedding_request(_name)
  41. if v is not None:
  42. vector.append(v)
  43. _name = ""
  44. if len(vector)>0:
  45. list_search = get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit)
  46. if list_search:
  47. return list_search
  48. return []
  49. def get_embedding_search(coll,index_name,name,grade,vector,search_params,output_fields,limit=3):
  50. if name is None or name=="":
  51. return None
  52. db = redis.Redis(connection_pool=pool_product)
  53. try:
  54. _md5 = getMD5(str(name))+"_milvus_%d"%(grade)
  55. _search_list = None
  56. try:
  57. _search_list = db.get(_md5)
  58. except Exception as e:
  59. log("get redis data error")
  60. if _search_list is not None:
  61. # log("_search_list is not None")
  62. return json.loads(_search_list)
  63. else:
  64. # log("search from milvus")
  65. list_result = []
  66. result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
  67. for hits in result:
  68. for hit in hits:
  69. list_result.append(hit)
  70. final_list = []
  71. for _search in list_result:
  72. _d = {}
  73. for k in output_fields:
  74. _d[k] = _search.entity.get(k)
  75. final_list.append(_d)
  76. final_list = remove_repeat_item(final_list,k="ots_name")
  77. for _d in final_list:
  78. # _d["length_dis"] = abs(len(_d.get("standard_name",""))-len(name))
  79. standard_set = set(_d.get("standard_name",""))
  80. name_set = set(name)
  81. _d["length_dis"] = len(standard_set&name_set)/max(len(standard_set)+len(name_set),1)
  82. final_list.sort(key=lambda x:x.get("length_dis",0),reverse=True)
  83. final_list.sort(key=lambda x:x.get("level",1))
  84. try:
  85. db.set(_md5,json.dumps(final_list))
  86. db.expire(_md5,PRODUCT_REDIS_CACHE_TIME)
  87. except Exception as e:
  88. traceback.print_exc()
  89. log("set redis data error")
  90. return final_list
  91. except Exception as e:
  92. traceback.print_exc()
  93. raise RuntimeError("get milvus search error")
  94. return None
  95. def remove_repeat_item(list_result,k="standard_name"):
  96. final_list = []
  97. set_k = set()
  98. for item in list_result:
  99. _v = item.get(k)
  100. if _v is not None and _v in set_k:
  101. continue
  102. final_list.append(item)
  103. set_k.add(_v)
  104. return final_list
  105. def get_embedding_request(sentence,retry_times=3):
  106. if sentence is None or sentence=="":
  107. return None
  108. db = redis.Redis(connection_pool=pool_product)
  109. try:
  110. _md5 = getMD5(get_milvus_standard_name(sentence))+"_embedding"
  111. _embedding = None
  112. try:
  113. _embedding = db.get(_md5)
  114. except Exception as e:
  115. log("get redis data error")
  116. if _embedding is not None:
  117. return json.loads(_embedding)
  118. else:
  119. _embedding = request_embedding(sentence,retry_times=retry_times)
  120. if _embedding is not None:
  121. try:
  122. db.set(_md5,json.dumps(_embedding))
  123. db.expire(_md5,60*60)
  124. except Exception as e:
  125. traceback.print_exc()
  126. log("set redis data error")
  127. return _embedding
  128. except Exception as e:
  129. traceback.print_exc()
  130. raise RuntimeError("get embedding request error")
  131. return None
  132. def judge_pur_chinese(keyword):
  133. """
  134. 中文字符的编码范围为: u'\u4e00' -- u'\u9fff:只要在此范围内就可以判断为中文字符串
  135. @param keyword:
  136. @return:
  137. """
  138. # 定义一个需要删除的标点符号字符串列表
  139. remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
  140. # 利用re.sub来删除中文字符串中的标点符号
  141. strings = re.sub(remove_chars, "", keyword) # 将keyword中文字符串中remove_chars中包含的标点符号替换为空字符串
  142. for ch in strings:
  143. if u'\u4e00' <= ch <= u'\u9fff':
  144. pass
  145. else:
  146. return False
  147. return True
  148. def get_chinese_string(string):
  149. list_s = []
  150. for s in re.split("[^\u4e00-\u9fff]",string):
  151. if s!="" and len(s)>=2:
  152. list_s.append(s)
  153. return list_s
  154. def is_area_brand(brand,area_set):
  155. brand = re.sub("[省市区县等]","",brand)
  156. if len(brand)>12:
  157. return 0
  158. if brand in area_set:
  159. return 2
  160. for _i in range(2,len(brand)):
  161. ns = brand[:_i]
  162. ns1 = brand[_i:]
  163. if ns in area_set and (ns1 in area_set or ns1==""):
  164. return 2
  165. if ns in area_set and len(brand)-_i<=5 and len(brand)-_i>=2:
  166. return 1
  167. return 0
  168. def jaccard_score(source,target):
  169. source_set = set([s for s in source])
  170. target_set = set([s for s in target])
  171. if len(source_set)==0 or len(target_set)==0:
  172. return 0
  173. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  174. from fuzzywuzzy import fuzz
  175. def is_similar(source,target,_radio=None):
  176. source = str(source).lower()
  177. target = str(target).lower()
  178. max_len = max(len(source),len(target))
  179. min_len = min(len(source),len(target))
  180. min_ratio = 90
  181. if min_len>=3:
  182. min_ratio = 87
  183. if min_len>=5:
  184. min_ratio = 85
  185. if _radio is not None:
  186. min_ratio = _radio
  187. # dis_len = abs(len(source)-len(target))
  188. # min_dis = min(max_len*0.2,4)
  189. if min_len==0 and max_len>0:
  190. return False
  191. if max_len<=2:
  192. if source==target:
  193. return True
  194. if min_len<2:
  195. return False
  196. #判断相似度
  197. similar = fuzz.ratio(source,target)
  198. if similar>=min_ratio:
  199. log("%s and %s similar_jaro %d"%(source,target,similar))
  200. return True
  201. similar_jaro = Levenshtein.jaro(source,target)
  202. if similar_jaro*100>=min_ratio:
  203. log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
  204. return True
  205. similar_jarow = Levenshtein.jaro_winkler(source,target)
  206. if similar_jarow*100>=min_ratio:
  207. log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
  208. return True
  209. if min_len>=5:
  210. if len(source)==max_len and str(source).find(target)>=0:
  211. return True
  212. elif len(target)==max_len and target.find(source)>=0:
  213. return True
  214. elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
  215. return True
  216. return False
  217. def is_contain(source,target,min_len=2):
  218. if len(source)>=len(target) and target in source and len(target)>=min_len:
  219. return True
  220. if len(target)>len(source) and source in target and len(source)>=min_len:
  221. return True
  222. return False
  223. def check_char(source,target,chat_pattern=re.compile("^[a-zA-Z0-9\-]+$"),find_pattern=re.compile("(?P<product>[a-zA-Z0-9-]+)")):
  224. if re.search(chat_pattern,source) is not None or re.search(chat_pattern,target) is not None:
  225. a = set(re.findall(find_pattern,source))
  226. b = set(re.findall(find_pattern,target))
  227. if len(a&b)>0:
  228. return True
  229. else:
  230. return False
  231. def check_product(source,target,remove_words):
  232. if remove_words is not None and remove_words!="":
  233. _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
  234. list_split = [a.strip() for a in _split if a.strip()!=""]
  235. for _s in list_split:
  236. if str(source).find(_s)>=0:
  237. return False
  238. _check = check_char(source,target)
  239. if _check:
  240. return True
  241. else:
  242. if _check==False:
  243. return False
  244. if len(source)>len(target) and target in source:
  245. return True
  246. max_len = max(len(source),len(target))
  247. min_len = min(len(source),len(target))
  248. if min_len<2:
  249. return False
  250. elif max_len<=5:
  251. min_ratio=96
  252. else:
  253. min_ratio = 95
  254. min_ratio = 98
  255. if is_similar(source,target,min_ratio):
  256. return True
  257. return False
  258. def check_brand(source,target,remove_words):
  259. source = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(source).lower())
  260. target = re.sub("省|市|县|集团|股份|有限|责任|公司",'',str(target).lower())
  261. if remove_words is not None and remove_words!="":
  262. _split = remove_words.split(DOCUMENT_PRODUCT_DICT_INTERFACE_STANDARD_ALIAS_SEPARATOR)
  263. list_split = [a.strip() for a in _split if a.strip()!=""]
  264. for _s in _split:
  265. if str(source).find(_s)>=0:
  266. return False
  267. max_len = max(len(source),len(target))
  268. min_len = min(len(source),len(target))
  269. if min_len<2:
  270. return False
  271. elif max_len<=5:
  272. min_ratio=94
  273. else:
  274. min_ratio = 90
  275. min_ratio = 98
  276. source_c = "".join(get_chinese_string(source))
  277. target_c = "".join(get_chinese_string(target))
  278. _check = check_char(source,target)
  279. if _check:
  280. return True
  281. else:
  282. if _check==False:
  283. return False
  284. if len(source_c)>=2 and len(target_c)>=2:
  285. if not(source_c in area_set or target_c in area_set):
  286. if is_contain(source_c,target_c):
  287. return True
  288. if is_similar(source_c,target_c,min_ratio):
  289. return True
  290. else:
  291. return False
  292. if has_same_specs_count(source,target):
  293. if is_contain(source,target):
  294. return True
  295. if is_similar(source,target,min_ratio):
  296. return True
  297. SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
  298. NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
  299. def has_same_specs_count(source, target):
  300. source = str(source).lower()
  301. target = str(target).lower()
  302. # just take care of type and count,lack of order
  303. dict_source = {}
  304. dict_target = {}
  305. for s in source:
  306. if s in SPECS_CHECK_SET:
  307. if s not in dict_source:
  308. dict_source[s] = 0
  309. dict_source[s] += 1
  310. for s in target:
  311. if s in SPECS_CHECK_SET:
  312. if s not in dict_target:
  313. dict_target[s] = 0
  314. dict_target[s] += 1
  315. union_keys = set(list(dict_source.keys())) & set(list(dict_target.keys()))
  316. if len(dict_source.keys())!= len(union_keys) or len(dict_target.keys())!= len(union_keys):
  317. return False
  318. for k,v in dict_source.items():
  319. if v!=dict_target.get(k):
  320. return False
  321. return True
  322. def is_legal_brand(ots_client,brand):
  323. _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",brand)
  324. if _search is not None:
  325. brand = _search.groupdict().get("brand")
  326. if brand is None or len(brand)<2:
  327. return False
  328. # check whether this brand exists in interface and action is delete
  329. bool_query = BoolQuery(must_queries=[
  330. TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_NAME,brand),
  331. TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE,BRAND_GRADE),
  332. TermQuery(DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION,"delete")
  333. ])
  334. rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_interface_table_name,Document_product_dict_interface_table_name+"_index",
  335. SearchQuery(bool_query,get_total_count=True))
  336. if total_count>0:
  337. return False
  338. # check whether this brand exists in dict and grade=name_grade or grade=specs_grade
  339. bool_query = BoolQuery(must_queries=[
  340. TermQuery(DOCUMENT_PRODUCT_DICT_NAME,brand),
  341. BoolQuery(should_queries=[
  342. TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,NAME_GRADE),
  343. TermQuery(DOCUMENT_PRODUCT_DICT_GRADE,SPECS_GRADE)
  344. ])
  345. ])
  346. rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_dict_table_name,Document_product_dict_table_name+"_index",
  347. SearchQuery(bool_query,get_total_count=True))
  348. if total_count>0:
  349. return False
  350. # check the area+brand type
  351. _f = is_area_brand(brand,area_set)
  352. if _f==1:
  353. log("%s is_legal_brand True by is_area_brand"%(brand))
  354. return True
  355. elif _f==2:
  356. return False
  357. # check the company type
  358. if len(brand)<100 and len(brand)>=8:
  359. _d = {ENTERPRISE_NAME:brand}
  360. _ent = Enterprise(_d)
  361. if _ent.exists_row(ots_client):
  362. _ent.fix_columns(ots_client,[ENTERPRISE_bid_number,ENTERPRISE_STATUS,ENTERPRISE_tyc_id],True)
  363. if _ent.getProperties().get(ENTERPRISE_STATUS,0)>=201 and _ent.getProperties().get(ENTERPRISE_STATUS,0)<=300:
  364. if _ent.getProperties().get(ENTERPRISE_bid_number,0)>0 or _ent.getProperties().get(ENTERPRISE_tyc_id,0):
  365. log("%s is_legal_brand True by Enterprise"%(brand))
  366. return True
  367. # check the group count and char
  368. bool_query = BoolQuery(must_queries=[
  369. TermQuery(DOCUMENT_PRODUCT_TMP_BRAND,brand)
  370. ])
  371. rows,next_token,total_count,is_all_succeed = ots_client.search(Document_product_tmp_table_name,Document_product_tmp_table_name+"_index",
  372. SearchQuery(bool_query,get_total_count=True))
  373. if total_count>=5:
  374. new_brand = re.sub("[^\u4e00-\u9fff]",'',brand)
  375. if re.search("详见|无|国产|null|其他|详细|废标|[0-9/]|品牌|文件|^见",brand) is None and len(brand)<=8:
  376. log("%s is_legal_brand True by count"%(brand))
  377. return True
  378. SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
  379. def is_legal_specs(specs):
  380. if specs is None or specs=="":
  381. return False
  382. specs = str(specs).lower()
  383. if re.search(SPECS_PATTERN,specs) is not None:
  384. return False
  385. # for s in specs:
  386. # if re.search(SPECS_PATTERN,s) is not None:
  387. # return False
  388. return True
  389. def check_specs(source,target):
  390. '''
  391. check if the source specs is the same as the target
  392. same only if the chars in SPECS_CHECK_SET have the same counts
  393. :param source:
  394. :param target:
  395. :return:
  396. '''
  397. source = str(source).lower()
  398. target = str(target).lower()
  399. source = re.sub(NOT_SPECS_PATTERN,'',source)
  400. target = re.sub(NOT_SPECS_PATTERN,'',target)
  401. if source==target and len(source)>0:
  402. return True
  403. if has_same_specs_count(source,target):
  404. _index = 0
  405. for _i in range(min(len(source),len(target))):
  406. _index = -(_i+1)
  407. if source[_index]!=target[_index]:
  408. break
  409. if abs(_index)>min(len(source),len(target))//2:
  410. return True
  411. return False
  412. import json
  413. import requests
  414. session = requests.Session()
  415. def request_embedding(sentence,retry_times=3):
  416. for _ in range(retry_times):
  417. sentence = get_milvus_standard_name(sentence)
  418. resp = session.post(embedding_url,json={"sentence":sentence})
  419. if resp.status_code==200:
  420. content = resp.content.decode("utf-8")
  421. _d = json.loads(content)
  422. if _d.get("success"):
  423. return _d.get("vector")
  424. return None
  425. def clean_product_name(product_name):
  426. '''
  427. clean before insert
  428. :param product_name:
  429. :return:
  430. '''
  431. return product_name
  432. def clean_product_brand(product_brand):
  433. '''
  434. clean before insert
  435. :param product_brand:
  436. :return:
  437. '''
  438. _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",product_brand)
  439. if _search is not None:
  440. product_brand = _search.groupdict().get("brand")
  441. brand = re.sub("[/\\,,、.|等]|一批|/无|品牌|^[/.]+",'',product_brand)
  442. for i in range(min(len(brand)-2,8)):
  443. _n = brand[:i+1]
  444. if _n in area_set:
  445. n_name = re.sub("^[省市区]]",'',brand[i+1:])
  446. face_id = get_document_product_dict_interface_base_id(n_name,BRAND_GRADE)
  447. _interface_d = {
  448. DOCUMENT_PRODUCT_DICT_INTERFACE_ID:face_id
  449. }
  450. _dpdi = Document_product_dict_interface(_interface_d)
  451. if _dpdi.exists_row(ots_client):
  452. brand = n_name
  453. break
  454. return brand
  455. def clean_product_specs(product_specs,_PATTERN = re.compile("[^A-Za-z0-9-\\/()().×*]|^[\\/.-]+")):
  456. '''
  457. clean before insert
  458. :param product_specs:
  459. :return:
  460. '''
  461. _specs = re.sub(_PATTERN,'',product_specs)
  462. if len(_specs)>0:
  463. return _specs
  464. return product_specs
  465. def clean_product_unit_price(product_unit_price):
  466. '''
  467. clean before insert
  468. :param product_unit_price:
  469. :return:
  470. '''
  471. try:
  472. if product_unit_price is not None and product_unit_price!="":
  473. _price = float(product_unit_price)
  474. return _price
  475. except Exception as e:
  476. return ""
  477. return ""
  478. def clean_product_quantity(product_quantity):
  479. '''
  480. :param product_quantity:
  481. :return:
  482. '''
  483. try:
  484. if product_quantity is not None and product_quantity!="":
  485. _quantity = int(product_quantity)
  486. return _quantity
  487. except Exception as e:
  488. return ""
  489. return ""
  490. if __name__ == '__main__':
  491. # print(check_brand('DYW-JY-T01-A1(定制)','JY',''))
  492. # print(check_product("医用冷藏箱","医用","a|"))
  493. # print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
  494. # import Levenshtein
  495. # print(Levenshtein.ratio('助听器','助行器'))
  496. # print(clean_product_specs("//4008SverssionV10"))
  497. print(is_legal_brand(getConnect_ots(),"产地:中国品牌:天津迈达型号:ODM-2100S"))
  498. print(clean_product_brand("产地:中国品牌:天津迈达型号:ODM-2100S"))
  499. # print(check_specs("500ml","3500ml"))
  500. # print(is_similar("手术显微镜配套无线工作站(含助手镜)","显微镜",80))