productUtils.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363
  1. from BaseDataMaintenance.maintenance.product.product_setting import *
  2. import Levenshtein
  3. import re
  4. # 判断是不是入参字符串为全中文
  5. from BaseDataMaintenance.dataSource.source import getConnect_redis_product
  6. from BaseDataMaintenance.dataSource.pool import ConnectorPool
  7. from BaseDataMaintenance.common.Utils import log
  8. from BaseDataMaintenance.common.documentFingerprint import getMD5
  9. from BaseDataMaintenance.common.milvusUtil import search_embedding
# Shared connector pool built on getConnect_redis_product; the cached lookups
# in this module borrow a connector from it and hand it back when done.
# NOTE(review): 10 and 30 are presumably pool size bounds - confirm against
# ConnectorPool's signature.
pool_product = ConnectorPool(10,30,getConnect_redis_product)
  11. def get_milvus_search(coll,index_name,name,vector,search_params,output_fields,limit=3):
  12. if name is None or name=="":
  13. return None
  14. db = pool_product.getConnector()
  15. try:
  16. _md5 = getMD5(str(name))+"_milvus"
  17. _search_list = db.get(_md5)
  18. if _search_list is not None:
  19. return json.loads(_search_list)
  20. else:
  21. list_result = []
  22. result = coll.search(vector,index_name,search_params,top_k=limit,output_fields=output_fields,limit=limit)
  23. for hits in result:
  24. for hit in hits:
  25. list_result.append(hit)
  26. final_list = []
  27. for _search in list_result:
  28. _d = {}
  29. for k in output_fields:
  30. _d[k] = _search.entity.get(k)
  31. final_list.append(_d)
  32. db.set(_md5,json.dumps(final_list))
  33. db.expire(_md5,2*60)
  34. return final_list
  35. except Exception as e:
  36. log("getExtract_json_fromRedis error %s"%(str(e)))
  37. raise RuntimeError("get milvus search error")
  38. finally:
  39. try:
  40. if db.connection.check_health():
  41. pool_product.putConnector(db)
  42. except Exception as e:
  43. pass
  44. return None
  45. return list_result
  46. def get_embedding_request(sentence,retry_times=3):
  47. if sentence is None or sentence=="":
  48. return None
  49. db = pool_product.getConnector()
  50. try:
  51. _md5 = getMD5(str(sentence))+"_embedding"
  52. _embedding = db.get(_md5)
  53. if _embedding is not None:
  54. return json.loads(_embedding)
  55. else:
  56. _embedding = request_embedding(sentence,retry_times=retry_times)
  57. if _embedding is not None:
  58. db.set(_md5,json.dumps(_embedding))
  59. return _embedding
  60. except Exception as e:
  61. log("getExtract_json_fromRedis error %s"%(str(e)))
  62. raise RuntimeError("get embedding request error")
  63. finally:
  64. try:
  65. if db.connection.check_health():
  66. pool_product.putConnector(db)
  67. except Exception as e:
  68. pass
  69. return None
  70. def judge_pur_chinese(keyword):
  71. """
  72. 中文字符的编码范围为: u'\u4e00' -- u'\u9fff:只要在此范围内就可以判断为中文字符串
  73. @param keyword:
  74. @return:
  75. """
  76. # 定义一个需要删除的标点符号字符串列表
  77. remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
  78. # 利用re.sub来删除中文字符串中的标点符号
  79. strings = re.sub(remove_chars, "", keyword) # 将keyword中文字符串中remove_chars中包含的标点符号替换为空字符串
  80. for ch in strings:
  81. if u'\u4e00' <= ch <= u'\u9fff':
  82. pass
  83. else:
  84. return False
  85. return True
  86. def get_chinese_string(string):
  87. list_s = []
  88. for s in re.split("[^\u4e00-\u9fff]",string):
  89. if s!="" and len(s)>=2:
  90. list_s.append(s)
  91. return list_s
  92. def is_area_brand(brand,area_set):
  93. brand = re.sub("[省市区县等]","",brand)
  94. if len(brand)>12:
  95. return 0
  96. for _i in range(2,len(brand)):
  97. ns = brand[:_i]
  98. ns1 = brand[_i:]
  99. if ns in area_set and (ns1 in area_set or ns1==""):
  100. return 2
  101. if ns in area_set and len(brand)-_i<=5 and len(brand)-_i>=2:
  102. return 1
  103. return 0
  104. def jaccard_score(source,target):
  105. source_set = set([s for s in source])
  106. target_set = set([s for s in target])
  107. if len(source_set)==0 or len(target_set)==0:
  108. return 0
  109. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  110. from fuzzywuzzy import fuzz
def is_similar(source,target,_radio=None):
    '''
    Decide whether two strings are similar enough to be treated as the same.

    Both inputs are converted with str() and lowercased. The checks run in
    order and the first one that succeeds returns True:
      1. fuzz.ratio >= min_ratio
      2. Levenshtein.jaro * 100 >= min_ratio
      3. Levenshtein.jaro_winkler * 100 >= 90 (fixed threshold)
      4. when the shorter string has at least 5 characters: the longer string
         contains the shorter one, or both are pure Chinese and
         jaccard_score(...) == 1
      5. the longer string contains the shorter one and the shorter one is
         pure Chinese (per judge_pur_chinese)

    :param source: first value
    :param target: second value
    :param _radio: optional threshold overriding the length-based min_ratio
    :return: True when any check succeeds, otherwise False
    '''
    source = str(source).lower()
    target = str(target).lower()
    max_len = max(len(source),len(target))
    min_len = min(len(source),len(target))
    # The threshold is relaxed as the shorter string gets longer.
    min_ratio = 90
    if min_len>=3:
        min_ratio = 87
    if min_len>=5:
        min_ratio = 85
    if _radio is not None:
        min_ratio = _radio
    # dis_len = abs(len(source)-len(target))
    # min_dis = min(max_len*0.2,4)
    # Exactly one side empty: never similar.
    if min_len==0 and max_len>0:
        return False
    # Very short strings only match when they are identical.
    if max_len<=2:
        if source==target:
            return True
    if min_len<2:
        return False
    # Similarity scores
    similar = fuzz.ratio(source,target)
    if similar>=min_ratio:
        return True
    similar_jaro = Levenshtein.jaro(source,target)
    if similar_jaro*100>=min_ratio:
        return True
    similar_jarow = Levenshtein.jaro_winkler(source,target)
    if similar_jarow*100>=90:
        return True
    if min_len>=5:
        if len(source)==max_len and str(source).find(target)>=0:
            return True
        elif len(target)==max_len and target.find(source)>=0:
            return True
        elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
            return True
    # Containment check when the contained string is pure Chinese
    if len(source)==max_len and judge_pur_chinese(target):
        if str(source).find(target)>=0:
            return True
    if len(target)==max_len and judge_pur_chinese(source):
        if target.find(source)>=0:
            return True
    return False
  157. def is_contain(source,target,min_len=2):
  158. if len(source)>=len(target) and target in source and len(target)>=min_len:
  159. return True
  160. if len(target)>len(source) and source in target and len(source)>=min_len:
  161. return True
  162. return False
  163. def check_product(source,target):
  164. if is_contain(source,target,min_len=3):
  165. return True
  166. return False
  167. def check_brand(source,target):
  168. source = str(source).lower()
  169. target = str(target).lower()
  170. if is_contain(source,target):
  171. return True
  172. SPECS_CHECK_SET = set([i for i in 'abcdefghijklmnopqrstuvwxyz']) | set([i for i in '0123456789.']) | set([i for i in 'IⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ'])
  173. NOT_SPECS_PATTERN = re.compile("[^%s]"%("".join(list(SPECS_CHECK_SET))))
  174. def has_same_specs_count(source, target):
  175. source = str(source).lower()
  176. target = str(target).lower()
  177. # just take care of type and count,lack of order
  178. dict_source = {}
  179. dict_target = {}
  180. for s in source:
  181. if s in SPECS_CHECK_SET:
  182. if s not in dict_source:
  183. dict_source[s] = 0
  184. dict_source[s] += 1
  185. for s in target:
  186. if s in SPECS_CHECK_SET:
  187. if s not in dict_target:
  188. dict_target[s] = 0
  189. dict_target[s] += 1
  190. union_keys = set(list(dict_source.keys())) & set(list(dict_target.keys()))
  191. if len(dict_source.keys())!= len(union_keys):
  192. return False
  193. for k,v in dict_source.items():
  194. if v!=dict_target.get(k):
  195. return False
  196. return True
# Matches any single character that is NOT allowed in a specification string.
# Allowed are ASCII letters, digits, "-", "/", "(", ")" and ".".
SPECS_PATTERN = re.compile("[^A-Za-z0-9-\\/()().]")
  198. def is_legal_specs(specs):
  199. if specs is None or specs=="":
  200. return False
  201. specs = str(specs).lower()
  202. if re.search(SPECS_PATTERN,specs) is not None:
  203. return False
  204. # for s in specs:
  205. # if re.search(SPECS_PATTERN,s) is not None:
  206. # return False
  207. return True
  208. def check_specs(source,target):
  209. '''
  210. check if the source specs is the same as the target
  211. same only if the chars in SPECS_CHECK_SET have the same counts
  212. :param source:
  213. :param target:
  214. :return:
  215. '''
  216. source = str(source).lower()
  217. target = str(target).lower()
  218. source = re.sub(NOT_SPECS_PATTERN,'',source)
  219. target = re.sub(NOT_SPECS_PATTERN,'',target)
  220. if source==target and len(source)>0:
  221. return True
  222. if has_same_specs_count(source,target):
  223. _index = 0
  224. for _i in range(min(len(source),len(target))):
  225. _index = -(_i+1)
  226. if source[_index]!=target[_index]:
  227. break
  228. if abs(_index)>min(len(source),len(target))//2:
  229. return True
  230. return False
  231. import json
  232. import requests
# Module-level HTTP session used by request_embedding for its POST requests.
session = requests.Session()
  234. def request_embedding(sentence,retry_times=3):
  235. for _ in range(retry_times):
  236. resp = session.post(embedding_url,json={"sentence":sentence})
  237. if resp.status_code==200:
  238. content = resp.content.decode("utf-8")
  239. _d = json.loads(content)
  240. if _d.get("success"):
  241. return _d.get("vector")
  242. return None
def clean_product_name(product_name):
    '''
    Clean a product name before insert.

    Currently a pass-through: no cleaning is applied.

    :param product_name: product name to clean
    :return: ``product_name`` unchanged
    '''
    return product_name
  250. def clean_product_brand(product_brand):
  251. '''
  252. clean before insert
  253. :param product_brand:
  254. :return:
  255. '''
  256. _search = re.search("品牌[::;;](?P<brand>.{2,8}?)([.。、;::]|规格|型号|生产厂家|厂家)",product_brand)
  257. if _search is not None:
  258. product_brand = _search.groupdict().get("brand")
  259. brand = re.sub("[/\\,,、.|等]|一批|/无|品牌",'',product_brand)
  260. return brand
  261. def clean_product_specs(product_specs):
  262. '''
  263. clean before insert
  264. :param product_specs:
  265. :return:
  266. '''
  267. _specs = re.sub(SPECS_PATTERN,'',product_specs)
  268. if len(_specs)>0:
  269. return _specs
  270. return product_specs
  271. def clean_product_unit_price(product_unit_price):
  272. '''
  273. clean before insert
  274. :param product_unit_price:
  275. :return:
  276. '''
  277. try:
  278. if product_unit_price is not None and product_unit_price!="":
  279. _price = float(product_unit_price)
  280. return _price
  281. except Exception as e:
  282. return ""
  283. return ""
  284. def clean_product_quantity(product_quantity):
  285. '''
  286. :param product_quantity:
  287. :return:
  288. '''
  289. try:
  290. if product_quantity is not None and product_quantity!="":
  291. _quantity = int(product_quantity)
  292. return _quantity
  293. except Exception as e:
  294. return ""
  295. return ""
if __name__ == '__main__':
    # Ad-hoc manual checks, executed only when the module is run as a script.
    # A long device description against a model name it contains.
    print(is_similar('128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10','VolusonE10'))
    # The same split that get_chinese_string performs, unfiltered.
    print(re.split("[^\u4e00-\u9fff]",'128排RevolutionCTES彩色多普勒超声诊断仪VolusonE10'))
    import Levenshtein
    # Two three-character names that differ in the middle character only.
    print(Levenshtein.ratio('助听器','助行器'))
    a = "无锡贝尔森品牌"
    print(clean_product_brand(a))