# products.py
  1. from BaseDataMaintenance.common.documentFingerprint import getMD5
  2. from BaseDataMaintenance.common.Utils import *
  3. from BaseDataMaintenance.common.milvusUtil import *
  4. from BaseDataMaintenance.common.multiThread import MultiThreadHandler
  5. from BaseDataMaintenance.maintenance.product.productUtils import *
  6. from BaseDataMaintenance.model.ots.document_product_tmp import *
  7. from BaseDataMaintenance.model.ots.document_product import *
  8. from BaseDataMaintenance.model.ots.document_product_dict import *
  9. from BaseDataMaintenance.model.ots.document_product_dict_interface import *
  10. from BaseDataMaintenance.model.ots.document import *
  11. from BaseDataMaintenance.model.ots.attachment import *
  12. from BaseDataMaintenance.model.ots.enterprise import *
  13. from tablestore import *
  14. from BaseDataMaintenance.dataSource.source import getConnect_ots
  15. from multiprocessing import Process,Queue
  16. from random import randint
  17. from BaseDataMaintenance.maintenance.product.product_dict import Product_Dict_Manager
  18. from apscheduler.schedulers.blocking import BlockingScheduler
  19. from BaseDataMaintenance.maintenance.product.make_brand_pattern import *
  20. from BaseDataMaintenance.maintenance.product.product_dict import IS_SYNCHONIZED
  21. import logging
# Configure the root logger once at import time so every module in this
# process emits records at INFO level and above.
root = logging.getLogger()
root.setLevel(logging.INFO)
  24. from uuid import uuid4
  25. class Product_Manager(Product_Dict_Manager):
  26. def __init__(self):
  27. super(Product_Manager, self).__init__()
  28. self.process_queue = Queue()
  29. self.ots_client = getConnect_ots()
  30. self.set_id = set()
  31. def get_product_id(self,docid,name,brand,specs,unit_price,quantity):
  32. if name is None:
  33. name = ""
  34. if brand is None:
  35. brand = ""
  36. if specs is None:
  37. specs = ""
  38. if quantity is None:
  39. quantity = ""
  40. if unit_price is None or unit_price=="":
  41. unit_price = ""
  42. else:
  43. unit_price = "%.2f"%float(unit_price)
  44. product_id = getMD5(str(docid)+str(name)+str(brand)+str(specs)+str(unit_price)+str(quantity))
  45. return product_id
    def producer(self,process_count=3000):
        '''
        Refill self.process_queue with temp-product rows awaiting standardization.

        Skips refilling while the queue still holds more than process_count/6
        items.  Rows are fetched from document_product_temp with status in
        [1,51), paginated via next_token, until roughly process_count rows (or
        the end of the index) have been seen.  Rows whose id was enqueued in
        the previous round (tracked in self.set_id) are skipped; at the end
        self.set_id is replaced with this round's ids, so earlier ids become
        eligible again next round.

        :param process_count: soft cap on rows queued per round
        '''
        q_size = self.process_queue.qsize()
        if q_size>process_count/6:
            return
        bool_query = BoolQuery(must_queries=[RangeQuery(DOCUMENT_PRODUCT_TMP_STATUS,1,51)])
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
                                                                            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
        list_data = getRow_ots(rows)
        _count = len(list_data)
        list_id = []
        for _d in list_data:
            _id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
            # already queued last round -- skip to avoid double processing
            if _id in self.set_id:
                continue
            list_id.append(_id)
            self.process_queue.put(_d)
        # keep paging until the soft cap is reached or the index is exhausted
        while next_token:
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product_temp","document_product_temp_index",
                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
            list_data = getRow_ots(rows)
            for _d in list_data:
                _id = _d.get(DOCUMENT_PRODUCT_TMP_ID)
                if _id in self.set_id:
                    continue
                list_id.append(_id)
                self.process_queue.put(_d)
            # _count counts fetched rows (including skipped ones), not queued rows
            _count += len(list_data)
            if _count>=process_count:
                break
        # remember only this round's ids; NOTE(review): not thread/process safe,
        # assumes a single producer -- confirm scheduling guarantees this
        self.set_id = set(list_id)
  78. def comsumer(self):
  79. def start_thread(thread_count):
  80. mt = MultiThreadHandler(self.process_queue,self.comsumer_handle,None,thread_count,1,False,True)
  81. mt.run()
  82. process_count = 4
  83. thread_count = 10
  84. list_process = []
  85. for _i in range(process_count):
  86. p = Process(target=start_thread,args=(thread_count,))
  87. list_process.append(p)
  88. for p in list_process:
  89. p.start()
  90. for p in list_process:
  91. p.join()
    def comsumer_handle(self,item,result_queue):
        # MultiThreadHandler callback: `item` is one document_product_tmp row
        # dict pulled from process_queue; `result_queue` is unused here.
        self.standardize(item)
    def standardize(self,tmp_dict,output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]):
        '''
        Standardize one temp product row (document_product_tmp) against the
        standard product dictionary.

        Matching is fuzzy (embedding search + similarity checks), grade by
        grade: name first, then brand, then specs.  A temp row is promoted to
        a standard Document_product only if its name matches a standard name.
        A matched brand/specs is replaced by its standard form; an unmatched
        but legal-looking one may be submitted through the dict interface
        table as a new entry.  Because the dictionary is a 3-level tree
        (name -> brand -> specs), matched brand/specs nodes are inserted under
        their matched parent when missing.

        :param tmp_dict: raw OTS row of document_product_tmp
        :param output_fields: fields requested from the embedding search index
        :return: None; results are written to OTS (a document_product row and
                 the temp row's status)

        NOTE(review): this body was reconstructed from an indentation-stripped
        paste; the nesting follows the apparent control flow -- confirm
        against version control before relying on edge-case behaviour.
        '''
        # todo: 1. auto-add new rows to the product parameter table?
        #          1) add new connections between existing names  2) add new specs
        # When matching specs, the differing characters must not contain
        # digits, letters or roman numerals, and differences in occurrence
        # counts are not ignored.
        save_product_tmp = Document_product_tmp({DOCUMENT_PRODUCT_TMP_ID:tmp_dict.get(DOCUMENT_PRODUCT_TMP_ID)})
        _status = 0
        document_product_tmp = Document_product_tmp(tmp_dict)
        name = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,"")
        brand = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,"")
        specs = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,"")
        parameters = document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_PARAMETER,"")
        # every non-empty field doubles as a fallback candidate for the later passes
        list_candidates = [a for a in [name,brand,specs,parameters] if a!=""]
        # when brand/specs are missing, fall back to the free-text parameters
        if brand=="" and parameters!="":
            brand = parameters
        if specs=="" and parameters!="":
            specs = parameters
        new_name = ""
        new_brand = ""
        new_specs = ""
        name_ots_id = None
        brand_ots_id = None
        specs_ots_id = None
        # ---------- pass 1: match the product name ----------
        if name is not None and name!="":
            name_vector = get_embedding_request(name)
            if name_vector is not None:
                Coll,_ = self.get_collection(NAME_GRADE)
                search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=60)
                for _search in search_list:
                    ots_id = _search.get("standard_name_id")
                    ots_name = _search.get("standard_name")
                    ots_parent_id = _search.get("ots_parent_id")
                    if is_similar(name,ots_name) or check_product(name,ots_name):
                        name_ots_id = ots_id
                        new_name = ots_name
                        # #update alias of name
                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                        # if _flag and _dpd.updateAlias(name):
                        #     _dpd.update_row(self.ots_client)
                        break
        # pass 1b: name not matched -- retry with every candidate field
        if name_ots_id is None:
            # NOTE(review): the loop keeps running after a hit, so a later
            # candidate can overwrite an earlier match -- confirm intended
            for name in list_candidates:
                name_vector = get_embedding_request(name)
                if name_vector is not None:
                    Coll,_ = self.get_collection(NAME_GRADE)
                    search_list = get_embedding_search(Coll,embedding_index_name,name,NAME_GRADE,[name_vector],self.search_params,output_fields,limit=20)
                    for _search in search_list:
                        ots_id = _search.get("standard_name_id")
                        ots_name = _search.get("standard_name")
                        ots_parent_id = _search.get("ots_parent_id")
                        if is_similar(name,ots_name) or check_product(name,ots_name):
                            name_ots_id = ots_id
                            new_name = ots_name
                            # #update alias of name
                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:name_ots_id})
                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                            # if _flag and _dpd.updateAlias(name):
                            #     _dpd.update_row(self.ots_client)
                            break
        # ---------- passes 2/3: brand and specs, only under a matched name ----------
        if name_ots_id is not None:
            if brand is not None and brand!="":
                s_brand = brand
                # try the raw brand, its cleaned form, and embedded Chinese substrings
                l_brand = [brand]
                l_brand.append(clean_product_brand(s_brand))
                brand_ch = get_chinese_string(brand)
                l_brand.extend(brand_ch)
                _find = False
                for brand in l_brand:
                    brand_vector = get_embedding_request(brand)
                    if brand_vector is not None:
                        Coll,_ = self.get_collection(BRAND_GRADE)
                        search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=60)
                        # log("search brand %s"%(brand))
                        for _search in search_list:
                            ots_id = _search.get("standard_name_id")
                            ots_name = _search.get("standard_name")
                            ots_parent_id = _search.get("ots_parent_id")
                            # log("check brand %s and %s"%(brand,ots_name))
                            if is_similar(brand,ots_name) or check_brand(brand,ots_name):
                                # log("check brand similar succeed:%s and %s"%(brand,ots_name))
                                # a "brand" identical to the standard name is noise; skip it
                                if ots_name==new_name:
                                    continue
                                new_brand = ots_name
                                log("checking brand %s succeed %s"%(brand,new_brand))
                                # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
                                if name_ots_id is not None:
                                    brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
                                    _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
                                                DOCUMENT_PRODUCT_DICT_NAME:new_brand,
                                                DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
                                                DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
                                                DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
                                                DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
                                                DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                }
                                    _dpd_brand = Document_product_dict(_d_brand)
                                    # _dpd_brand.updateAlias(str(new_brand).lower())
                                    if not _dpd_brand.exists_row(self.ots_client):
                                        _dpd_brand.update_row(self.ots_client)
                                    else:
                                        pass
                                        # #update alias
                                        # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
                                        # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                                        # if _flag:
                                        #     if _dpd.updateAlias(brand):
                                        #         _dpd.update_row(self.ots_client)
                                _find = True
                                break
                            else:
                                # log("check brand similar failed:%s and %s"%(brand,ots_name))
                                # add new brand?
                                pass
                    if _find:
                        break
                if not _find:
                    # nothing matched: submit the first legal cleaned variant as a
                    # brand-new brand through the dict interface table
                    for brand in l_brand:
                        if self.check_new_brand(brand):
                            new_brand = clean_product_brand(brand)
                            if new_brand=="":
                                continue
                            log("adding new brand %s"%(str(new_brand)))
                            _d_brand = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_brand,
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(str(brand).lower()),
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:BRAND_GRADE,
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:name_ots_id,
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                        DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
                                        }
                            dpdi = Document_product_dict_interface(_d_brand)
                            dpdi.update_row(self.ots_client)
                            break
            # pass 2b: brand still unmatched -- retry with every candidate field
            if brand_ots_id is None:
                _find = False
                for brand in list_candidates:
                    if _find:
                        break
                    l_brand = [brand]
                    l_brand.append(clean_product_brand(brand))
                    brand_ch = get_chinese_string(brand)
                    l_brand.extend(brand_ch)
                    for brand in l_brand:
                        if _find:
                            break
                        start_time = time.time()
                        # brand_vector = request_embedding(brand)
                        brand_vector = get_embedding_request(brand)
                        debug("get embedding for brand %s takes %.4fs"%(brand,time.time()-start_time))
                        if brand_vector is not None:
                            Coll,_ = self.get_collection(BRAND_GRADE)
                            start_time = time.time()
                            # search_list = search_embedding(Coll,embedding_index_name,[brand_vector],self.search_params,output_fields,limit=10)
                            search_list = get_embedding_search(Coll,embedding_index_name,brand,BRAND_GRADE,[brand_vector],self.search_params,output_fields,limit=10)
                            debug("get search_list for brand %s takes %.4fs"%(brand,time.time()-start_time))
                            # log("search brand %s"%(brand))
                            for _search in search_list:
                                ots_id = _search.get("standard_name_id")
                                ots_name = _search.get("standard_name")
                                ots_parent_id = _search.get("ots_parent_id")
                                # log("check brand %s and %s"%(brand,ots_name))
                                # stricter similarity threshold (_radio=95) than pass 2
                                if is_similar(brand,ots_name,_radio=95) or check_brand(brand,ots_name):
                                    # log("check brand similar succeed:%s and %s"%(brand,ots_name))
                                    if ots_name==new_name:
                                        continue
                                    new_brand = ots_name
                                    log("checking brand %s succeed %s"%(brand,new_brand))
                                    # judge if the brand which parent_id is name_ots_id exists,if not insert one else update alias
                                    if name_ots_id is not None:
                                        brand_ots_id = get_document_product_dict_id(name_ots_id,new_brand)
                                        _d_brand = {DOCUMENT_PRODUCT_DICT_ID:brand_ots_id,
                                                    DOCUMENT_PRODUCT_DICT_NAME:new_brand,
                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(brand).lower()),
                                                    DOCUMENT_PRODUCT_DICT_GRADE:BRAND_GRADE,
                                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:name_ots_id,
                                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
                                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                    }
                                        _dpd_brand = Document_product_dict(_d_brand)
                                        # _dpd_brand.updateAlias(str(new_brand).lower())
                                        if not _dpd_brand.exists_row(self.ots_client):
                                            _dpd_brand.update_row(self.ots_client)
                                        else:
                                            pass
                                            # #update alias
                                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:brand_ots_id})
                                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                                            # if _flag:
                                            #     if _dpd.updateAlias(brand):
                                            #         _dpd.update_row(self.ots_client)
                                    _find = True
                                    break
            # ---------- pass 3: match the specs ----------
            if specs is not None and specs!="":
                debug("getting sepcs %s"%(specs))
                list_specs = []
                c_specs = clean_product_specs(specs)
                list_specs.append(c_specs)
                # long non-Chinese fragments of the raw specs are extra candidates
                for s in re.split("[\u4e00-\u9fff]",specs):
                    if s!="" and len(s)>4:
                        list_specs.append(s)
                similar_flag = None
                _index = 0
                break_flag = False
                for c_specs in list_specs:
                    if break_flag:
                        break
                    _index += 1
                    specs_vector = get_embedding_request(c_specs)
                    if specs_vector is not None:
                        Coll,_ = self.get_collection(SPECS_GRADE)
                        search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=60)
                        for _search in search_list:
                            ots_id = _search.get("standard_name_id")
                            ots_name = _search.get("standard_name")
                            ots_parent_id = _search.get("ots_parent_id")
                            debug("checking specs %s and %s"%(specs,ots_name))
                            if is_similar(specs,ots_name):
                                # log("specs is_similar")
                                if check_specs(c_specs,ots_name):
                                    break_flag = True
                                    new_specs = ots_name
                                    log("check_specs %s succeed %s"%(specs,new_specs))
                                    # to update the document_product_dict which is builded for search
                                    if brand_ots_id is not None:
                                        # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
                                        specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
                                        _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                    DOCUMENT_PRODUCT_DICT_NAME:new_specs,
                                                    DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
                                                    DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                    DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                    DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
                                                    DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
                                                    DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                    DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                    }
                                        _dpd_specs = Document_product_dict(_d_specs)
                                        # _dpd_specs.updateAlias(str(new_specs).lower())
                                        if not _dpd_specs.exists_row(self.ots_client):
                                            _dpd_specs.update_row(self.ots_client)
                                        # user interface to add
                                        else:
                                            pass
                                            # #update alias
                                            # _dpd = Document_product_dict({DOCUMENT_PRODUCT_DICT_ID:specs_ots_id})
                                            # _flag = _dpd.fix_columns(self.ots_client,[DOCUMENT_PRODUCT_DICT_ALIAS],True)
                                            # if _flag:
                                            #     if _dpd.updateAlias(specs):
                                            #         _dpd.update_row(self.ots_client)
                                    break
                                else:
                                    if _index == 1:
                                        similar_flag = True
                # add new specs?
                debug("specs not similar")
                # NOTE(review): in the stripped source this add-new-specs step
                # appears to run even when a standard specs matched above and
                # may overwrite new_specs -- verify the original indentation.
                if is_legal_specs(specs) and len(specs)<MAX_NAME_LENGTH and len(specs)>=5:
                    debug("is_legal_specs")
                    new_specs = clean_product_specs(specs)
                    # insert into document_product_dict a new record
                    # to update the document_product_dict which is builded for search
                    # add new specs
                    if brand_ots_id is not None and name_ots_id is not None:
                        _md5 = get_document_product_dict_id(brand_ots_id,new_specs)
                        # _d = {DOCUMENT_PRODUCT_DICT_ID:_md5,
                        #       DOCUMENT_PRODUCT_DICT_NAME:new_specs,
                        #       DOCUMENT_PRODUCT_DICT_ALIAS:"%s&&%s"%(specs,new_specs),
                        #       DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                        #       DOCUMENT_PRODUCT_DICT_STATUS:1,
                        #       DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
                        #       DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                        #       DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                        #       }
                        # _dpd = Document_product_dict(_d)
                        # _dpd.update_row(self.ots_client)
                        log("adding new specs %s"%(new_specs))
                        # user interface to add
                        _d = {DOCUMENT_PRODUCT_DICT_INTERFACE_ID:uuid4().hex,
                              DOCUMENT_PRODUCT_DICT_INTERFACE_NAME:new_specs,
                              DOCUMENT_PRODUCT_DICT_INTERFACE_ALIAS:"%s"%(new_specs.lower()),
                              DOCUMENT_PRODUCT_DICT_INTERFACE_GRADE:SPECS_GRADE,
                              DOCUMENT_PRODUCT_DICT_INTERFACE_STATUS:1,
                              DOCUMENT_PRODUCT_DICT_INTERFACE_PARENT_ID:brand_ots_id,
                              DOCUMENT_PRODUCT_DICT_INTERFACE_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                              DOCUMENT_PRODUCT_DICT_INTERFACE_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                              DOCUMENT_PRODUCT_DICT_INTERFACE_ACTION:"insert"
                              }
                        _dpdi = Document_product_dict_interface(_d)
                        _dpdi.update_row(self.ots_client)
            # pass 3b: specs still unmatched -- retry with every candidate field
            if specs_ots_id is None:
                _find = False
                for specs in list_candidates:
                    if _find:
                        break
                    debug("getting sepcs %s"%(specs))
                    list_specs = []
                    c_specs = clean_product_specs(specs)
                    list_specs.append(c_specs)
                    for s in re.split("[\u4e00-\u9fff]",specs):
                        if s!="" and len(s)>4:
                            list_specs.append(s)
                    similar_flag = None
                    _index = 0
                    for c_specs in list_specs:
                        if _find:
                            break
                        _index += 1
                        specs_vector = get_embedding_request(c_specs)
                        if specs_vector is not None:
                            Coll,_ = self.get_collection(SPECS_GRADE)
                            search_list = get_embedding_search(Coll,embedding_index_name,c_specs,SPECS_GRADE,[specs_vector],self.search_params,output_fields,limit=20)
                            for _search in search_list:
                                if _find:
                                    break
                                ots_id = _search.get("standard_name_id")
                                ots_name = _search.get("standard_name")
                                ots_parent_id = _search.get("ots_parent_id")
                                debug("checking specs %s and %s"%(specs,ots_name))
                                if is_similar(specs,ots_name):
                                    # log("specs is_similar")
                                    if check_specs(c_specs,ots_name):
                                        break_flag = True
                                        new_specs = ots_name
                                        if brand_ots_id is not None:
                                            # judge if the specs which parent_id is brand_ots_id exists,insert one if not exists else update alias
                                            specs_ots_id = get_document_product_dict_id(brand_ots_id,new_specs)
                                            _d_specs = {DOCUMENT_PRODUCT_DICT_ID:specs_ots_id,
                                                        DOCUMENT_PRODUCT_DICT_NAME:new_specs,
                                                        DOCUMENT_PRODUCT_DICT_ALIAS:"%s"%(str(specs).lower()),
                                                        DOCUMENT_PRODUCT_DICT_GRADE:SPECS_GRADE,
                                                        DOCUMENT_PRODUCT_DICT_STATUS:1,
                                                        DOCUMENT_PRODUCT_DICT_PARENT_ID:brand_ots_id,
                                                        DOCUMENT_PRODUCT_DICT_IS_SYNCHONIZED:IS_SYNCHONIZED,
                                                        DOCUMENT_PRODUCT_DICT_CREATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                        DOCUMENT_PRODUCT_DICT_UPDATE_TIME:getCurrent_date(format="%Y-%m-%d %H:%M:%S"),
                                                        }
                                            _dpd_specs = Document_product_dict(_d_specs)
                                            # _dpd_specs.updateAlias(str(new_specs).lower())
                                            if not _dpd_specs.exists_row(self.ots_client):
                                                _dpd_specs.update_row(self.ots_client)
                                        _find = True
                                        break
        # ---------- final: promote to document_product when the name matched ----------
        # judge if the product matches the standard product
        if name_ots_id is not None:
            is_legal_data = True
            #standard the product and same to document_product table
            _product = Document_product(tmp_dict)
            docid = _product.getProperties().get(DOCUMENT_PRODUCT_DOCID)
            unit_price = _product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE)
            quantity = _product.getProperties().get(DOCUMENT_PRODUCT_QUANTITY)
            unit_price = clean_product_unit_price(unit_price)
            quantity = clean_product_quantity(quantity)
            total_price = _product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE)
            _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
            _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
            win_bid_price = _product.getProperties().get(DOCUMENT_PRODUCT_WIN_BID_PRICE)
            # reconcile unit_price/quantity/total_price; irreconcilable rows are
            # flagged illegal instead of being saved
            if isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)) and isinstance(total_price,(float,int)):
                if unit_price>0:
                    new_quantity = total_price/unit_price
                    if new_quantity!=quantity:
                        if new_quantity==total_price//unit_price:
                            quantity = int(new_quantity)
                            _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
                        else:
                            is_legal_data = False
                elif quantity>0:
                    unit_price = total_price/quantity
                    _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
            elif isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
                total_price = float("%.2f"%(unit_price*quantity))
                _product.setValue(DOCUMENT_PRODUCT_TOTAL_PRICE,total_price,True)
            elif isinstance(unit_price,(float,int)) and isinstance(total_price,(float,int)):
                if unit_price>0:
                    quantity = int(total_price//unit_price)
                    _product.setValue(DOCUMENT_PRODUCT_QUANTITY,quantity,True)
            elif isinstance(quantity,(float,int)) and isinstance(total_price,(float,int)):
                if quantity>0:
                    unit_price = float("%.2f"%(total_price/quantity))
                    _product.setValue(DOCUMENT_PRODUCT_UNIT_PRICE,unit_price,True)
            elif isinstance(quantity,(float,int)) and quantity>10000:
                is_legal_data = False
            # sanity checks: total far above the win bid price, or absurd unit price
            if isinstance(_product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE),(float,int)) and isinstance(win_bid_price,(float,int)):
                if _product.getProperties().get(DOCUMENT_PRODUCT_TOTAL_PRICE)>win_bid_price*10 and win_bid_price>0:
                    is_legal_data = False
            if isinstance(_product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE),(float,int)) and _product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE)>100000000:
                is_legal_data = False
            new_id = self.get_product_id(docid,new_name,new_brand,new_specs,unit_price,quantity)
            _product.setValue(DOCUMENT_PRODUCT_ID,new_id,True)
            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_ID,tmp_dict.get(DOCUMENT_PRODUCT_TMP_ID),True)
            if name_ots_id is not None:
                _product.setValue(DOCUMENT_PRODUCT_DICT_NAME_ID,name_ots_id,True)
            if brand_ots_id is not None:
                _product.setValue(DOCUMENT_PRODUCT_DICT_BRAND_ID,brand_ots_id,True)
            if specs_ots_id is not None:
                _product.setValue(DOCUMENT_PRODUCT_DICT_SPECS_ID,specs_ots_id,True)
            _product.setValue(DOCUMENT_PRODUCT_NAME,new_name,True)
            _product.setValue(DOCUMENT_PRODUCT_BRAND,new_brand,True)
            _product.setValue(DOCUMENT_PRODUCT_SPECS,new_specs,True)
            _product.setValue(DOCUMENT_PRODUCT_STATUS,randint(201,300),True)
            _product.setValue(DOCUMENT_PRODUCT_BRANDSPECS,"%s&&%s"%(new_brand,new_specs),True)
            _product.setValue(DOCUMENT_PRODUCT_FULL_NAME,"%s&&%s&&%s"%(new_name,new_brand,new_specs),True)
            _product.setValue(DOCUMENT_PRODUCT_CREATE_TIME,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
            _product.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,document_product_tmp.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
            bid_filemd5s = self.get_bid_filemd5s(docid,self.ots_client)
            if bid_filemd5s is not None:
                _product.setValue(DOCUMENT_PRODUCT_BID_FILEMD5S,bid_filemd5s,True)
            # status ranges observed here: 201-300 saved, 401-450 name unmatched,
            # 451-500 duplicate, 501-550 illegal data
            if not is_legal_data:
                _status = randint(501,550)
            elif self.dumplicate(_product):
                _status = randint(201,300)
                save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_NEW_ID,new_id,True)
                _product.update_row(self.ots_client)
            else:
                _status = randint(451,500)
        else:
            _status = randint(401,450)
        save_product_tmp.setValue(DOCUMENT_PRODUCT_TMP_STATUS,_status,True)
        save_product_tmp.update_row(self.ots_client)
    def check_new_brand(self,brand):
        # A brand unseen in the dictionary is accepted as a new entry only if
        # it passes the legality check against OTS data.
        return is_legal_brand(self.ots_client,brand)
  530. @staticmethod
  531. def get_bid_filemd5s(docid,ots_client):
  532. bool_query = BoolQuery(must_queries=[
  533. TermQuery("docids",docid)
  534. ])
  535. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  536. SearchQuery(bool_query,limit=10),
  537. columns_to_get=ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
  538. list_data = getRow_ots(rows)
  539. list_bid_filemd5s = []
  540. set_docids = set([docid])
  541. set_md5s = set()
  542. for _d in list_data:
  543. try:
  544. docids = _d.get("docids","")
  545. for _id in docids.split(","):
  546. set_docids.add(int(_id))
  547. except Exception as e:
  548. pass
  549. list_docids = list(set_docids)
  550. for _docid in list_docids:
  551. _d = {document_partitionkey:_docid%500+1,
  552. document_docid:_docid}
  553. _doc = Document(_d)
  554. _doc.fix_columns(ots_client,[document_attachment_path],True)
  555. page_attachments = _doc.getProperties().get(document_attachment_path)
  556. if page_attachments is not None and page_attachments!="":
  557. attachments = json.loads(page_attachments)
  558. for _a in attachments:
  559. _filemd5 = _a.get(document_attachment_path_filemd5)
  560. if _filemd5 in set_md5s or _filemd5 is None:
  561. continue
  562. set_md5s.add(_filemd5)
  563. _da = {attachment_filemd5:_filemd5}
  564. _attach = attachment(_da)
  565. _attach.fix_columns(ots_client,[attachment_classification],True)
  566. if _attach.getProperties().get(attachment_classification,"")=="招标文件":
  567. list_bid_filemd5s.append(_filemd5)
  568. if len(list_bid_filemd5s)==0:
  569. return None
  570. return ",".join(list(set(list_bid_filemd5s)))
  571. def get_value_count(self,name,brand,specs,unit_price,quantity):
  572. value_count = 0
  573. if len(name)>0:
  574. value_count += 1
  575. if len(brand)>0:
  576. value_count += 1
  577. if len(specs)>0:
  578. value_count += 1
  579. if isinstance(unit_price,(float,int)) and unit_price>0:
  580. value_count += 1
  581. if isinstance(quantity,(float,int)) and quantity>0:
  582. value_count += 1
  583. return value_count
    def dumplicate_search_product(self,document_product):
        """Search the document_product table for a record duplicating this one.

        Two passes against "document_product_index", both restricted to a
        +/-30-day window around the row's page_time:

        1. exact match on name/brand/specs/unit_price/quantity -> the found
           id wins immediately (current row should still be saved).
        2. match on name/brand/tenderee/supplier; a candidate survives unless
           specs/unit_price/quantity is present on BOTH sides with a
           conflicting value.  The side with the higher get_value_count
           (more complete record) is the one to keep.

        :param document_product: Document_product wrapper for the new row.
        :return: (duplicate_id, to_save).  duplicate_id is None when no
            duplicate was found.  to_save==1 means save the current record
            (dumping the duplicate); to_save==0 means the existing record is
            more complete and the current one should be dropped.
        """
        docid = document_product.getProperties().get(DOCUMENT_PRODUCT_DOCID)
        name = str(document_product.getProperties().get(DOCUMENT_PRODUCT_NAME,""))
        brand = str(document_product.getProperties().get(DOCUMENT_PRODUCT_BRAND,""))
        specs = str(document_product.getProperties().get(DOCUMENT_PRODUCT_SPECS,""))
        unit_price = document_product.getProperties().get(DOCUMENT_PRODUCT_UNIT_PRICE,"")
        quantity = document_product.getProperties().get(DOCUMENT_PRODUCT_QUANTITY,"")
        page_time = document_product.getProperties().get(DOCUMENT_PRODUCT_PAGE_TIME)
        tenderee = str(document_product.getProperties().get(DOCUMENT_PRODUCT_TENDEREE,""))
        supplier = str(document_product.getProperties().get(DOCUMENT_PRODUCT_SUPPLIER,""))
        # fall back to an exact-date window when page_time cannot be shifted
        page_time_before = page_time
        page_time_after = page_time
        try:
            # NOTE(review): the +30 call omits format="%Y-%m-%d"; presumably
            # timeAdd defaults to that format -- confirm, otherwise the window
            # bounds would be expressed in different formats.
            page_time_before = timeAdd(page_time,-30,format="%Y-%m-%d",)
            page_time_after = timeAdd(page_time,30)
        except Exception as e:
            pass
        # pass 1: all five value fields present -> exact-match lookup
        if len(name)>0 and len(brand)>0 and len(specs)>0 and isinstance(unit_price,(float,int)) and isinstance(quantity,(float,int)):
            bool_query = BoolQuery(must_queries=[TermQuery("name",name),
                                                 RangeQuery("page_time",page_time_before,page_time_after,True,True),
                                                 TermQuery("brand",brand),
                                                 TermQuery("specs",specs),
                                                 TermQuery("unit_price",unit_price),
                                                 TermQuery("quantity",quantity)
                                                 ])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
                                                                                SearchQuery(bool_query,limit=1),
                                                                                columns_to_get=ColumnsToGet(["name",'brand','specs'],return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            if len(list_data)>0:
                # exact duplicate exists; keep saving the current row (to_save=1)
                return list_data[0].get(DOCUMENT_PRODUCT_ID),1
        # pass 2: fuzzy match on name/brand plus the two party names
        if len(name)>0 and len(brand)>0 and len(supplier)>0 and len(tenderee)>0:
            # log("docid %s name %s page_time_before %s page_time_after %s brand %s supplier %s tenderee %s"%(str(docid),name,page_time_before,page_time_after,brand,supplier,tenderee))
            bool_query = BoolQuery(must_queries=[TermQuery("name",name),
                                                 RangeQuery("page_time",page_time_before,page_time_after,True,True),
                                                 TermQuery(DOCUMENT_PRODUCT_BRAND,brand),
                                                 TermQuery(DOCUMENT_PRODUCT_TENDEREE,tenderee),
                                                 TermQuery(DOCUMENT_PRODUCT_SUPPLIER,supplier),
                                                 ])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_product","document_product_index",
                                                                                SearchQuery(bool_query,limit=50),
                                                                                columns_to_get=ColumnsToGet(['name','brand','specs','unit_price','quantity'],return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            value_count = self.get_value_count(name,brand,specs,unit_price,quantity)
            for _d in list_data:
                s_id = _d.get(DOCUMENT_PRODUCT_ID)
                s_name = _d.get(DOCUMENT_PRODUCT_NAME,"")
                s_brand = _d.get(DOCUMENT_PRODUCT_BRAND,"")
                s_specs = _d.get(DOCUMENT_PRODUCT_SPECS,"")
                s_unit_price = _d.get(DOCUMENT_PRODUCT_UNIT_PRICE,"")
                s_quantity = _d.get(DOCUMENT_PRODUCT_QUANTITY,"")
                check_flag = True
                value_count1 = self.get_value_count(s_name,s_brand,s_specs,s_unit_price,s_quantity)
                # a field present on both sides with different values rules
                # the candidate out; a field missing on either side does not
                if len(specs)>0 and len(s_specs)>0 and specs!=s_specs:
                    check_flag = False
                elif isinstance(unit_price,(float,int)) and isinstance(s_unit_price,(float,int)) and unit_price!=s_unit_price:
                    check_flag = False
                elif isinstance(quantity,(float,int)) and isinstance(s_quantity,(float,int)) and quantity!=s_quantity:
                    check_flag = False
                if check_flag:
                    # keep whichever record has more populated fields
                    if value_count<value_count1:
                        to_save = 0
                    else:
                        to_save = 1
                    return s_id,to_save
        # no duplicate: save the current record
        return None,1
  650. def dumplicate(self,document_product):
  651. '''
  652. Duplicates the product data
  653. 将同一个产品的采购结果公示进行去重,结合公告进行。
  654. :return:True if not repeated else False
  655. '''
  656. dump_id,to_save = self.dumplicate_search_product(document_product)
  657. if dump_id is not None:
  658. document_product.setValue(DOCUMENT_PRODUCT_DUMP_ID,dump_id,True)
  659. if to_save==1:
  660. if dump_id is not None:
  661. _d = {DOCUMENT_PRODUCT_ID:dump_id,
  662. DOCUMENT_PRODUCT_STATUS:randint(401,450),
  663. DOCUMENT_PRODUCT_DUMP_ID:document_product.getProperties().get(DOCUMENT_PRODUCT_ID)}
  664. _dp = Document_product(_d)
  665. _dp.update_row(self.ots_client)
  666. return True
  667. else:
  668. return False
  669. def start_processing(self):
  670. scheduler = BlockingScheduler()
  671. scheduler.add_job(self.producer,"cron",second="*/20")
  672. scheduler.add_job(self.comsumer,"cron",minute="*/1")
  673. scheduler.add_job(self.embedding_comsumer,"cron",minute="*/1")
  674. scheduler.add_job(self.embedding_interface_comsumer,"cron",second="*/20")
  675. scheduler.start()
  676. def test(self):
  677. from BaseDataMaintenance.common.sentencesUtil import cosine_similarity
  678. import torch
  679. output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
  680. id = '56bdad168c71a1fc4d57cd10bcd987f0'
  681. collection,_ = self.get_collection(SPECS_GRADE)
  682. vector = request_embedding("西门子MAGNETOMLumina")
  683. vector1 = request_embedding("西门子")
  684. print("cosine similarity",cosine_similarity(torch.from_numpy(np.array([vector])) ,torch.from_numpy(np.array([vector1]))))
  685. Coll,_ = self.get_collection(SPECS_GRADE)
  686. search_list = search_embedding(Coll,embedding_index_name,[vector],self.search_params,output_fields,limit=60)
  687. for p in search_list:
  688. print(p)
  689. #
  690. # res = collection.query(
  691. # expr = "ots_id in ['%s']"%(id),
  692. # offset = 0,
  693. # limit = 10,
  694. # output_fields = output_fields,
  695. # consistency_level="Strong"
  696. # )
  697. # print(res)
  698. def start_process_product():
  699. pm = Product_Manager()
  700. pm.start_processing()
def fix_product_data():
    '''
    Repair helper for already-processed product rows.

    Scans document_product_temp for records with status >= 501 (already
    promoted to document_product), then feeds every row to ``handle`` in a
    30-thread pool: rows whose win_bid_price is 0 get their temp status
    reset to 1 so the pipeline reprocesses them.

    ``fix_missing_data`` and ``deleteAndReprocess`` are alternative per-row
    repairs kept for manual use; only ``handle`` is wired to the pool below.
    :return: None
    '''
    table_name = "document_product_temp"
    table_index = "document_product_temp_index"
    columns = [DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE]
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[
        RangeQuery("status",501),
        # TermQuery("docid",246032980)
    ])
    # first page needs an explicit sort; later pages continue via next_token
    rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status")]),limit=100,get_total_count=True),
        columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    list_rows = getRow_ots(rows)
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search(table_name,table_index,
            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
            columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        list_rows.extend(getRow_ots(rows))
        print("%d/%d"%(len(list_rows),total_count))
        # if len(list_rows)>10000:
        #     break
    task_queue = Queue()
    for d in list_rows:
        task_queue.put(d)
    def fix_missing_data(item,result_queue):
        # backfill the document_product row from its temp original
        original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
        _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
        dpt = Document_product_tmp(_d)
        dpt.fix_columns(ots_client,["name","brand","specs"],True)
        _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
        dp = Document_product(_d)
        #fix the project_code and original_name and bidi_filemd5s
        docid = int(item.get(DOCUMENT_PRODUCT_DOCID))
        partitionkey = docid%500+1
        # project_name = item.get(DOCUMENT_PRODUCT_PROJECT_NAME,"")
        # if project_name=="":
        #     #fix project_name
        #     _doc = Document({"partitionkey":partitionkey,
        #                      "docid":docid})
        #     _doc.fix_columns(ots_client,["doctitle"],True)
        #     dp.setValue(DOCUMENT_PRODUCT_DOCTITLE,_doc.getProperties().get("doctitle"),True)
        bid_filemd5s = Product_Manager.get_bid_filemd5s(docid,ots_client)
        if bid_filemd5s is not None:
            dp.setValue(DOCUMENT_PRODUCT_BID_FILEMD5S,bid_filemd5s,True)
        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_NAME,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_NAME,""),True)
        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_BRAND,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_BRAND,""),True)
        dp.setValue(DOCUMENT_PRODUCT_ORIGINAL_SPECS,dpt.getProperties().get(DOCUMENT_PRODUCT_TMP_SPECS,""),True)
        dp.update_row(ots_client)
    def deleteAndReprocess(item,result_queue):
        # delete the promoted row and flip the temp row back to status 1 to rerun
        original_id = item.get(DOCUMENT_PRODUCT_ORIGINAL_ID)
        # delete data and rerun
        _d = {DOCUMENT_PRODUCT_TMP_ID:original_id,DOCUMENT_PRODUCT_TMP_STATUS:1}
        dpt = Document_product_tmp(_d)
        dpt.update_row(ots_client)
        _d = {DOCUMENT_PRODUCT_ID:item.get(DOCUMENT_PRODUCT_ID)}
        dp = Document_product(_d)
        dp.delete_row(ots_client)
    def handle(item,result_queue):
        # rows with a zero win_bid_price go back through the pipeline
        win_bid_price = item.get(DOCUMENT_PRODUCT_TMP_WIN_BID_PRICE,1)
        if win_bid_price==0:
            dpt = Document_product_tmp(item)
            dpt.setValue(DOCUMENT_PRODUCT_TMP_STATUS,1,True)
            dpt.update_row(ots_client)
    mt = MultiThreadHandler(task_queue,handle,None,30,1)
    mt.run()
  770. def test_check_brand():
  771. import logging
  772. root = logging.getLogger()
  773. root.setLevel(logging.DEBUG)
  774. from queue import Queue
  775. brand_path = "brand.txt"
  776. list_brand = []
  777. with open(brand_path,"r",encoding="utf8") as f:
  778. while 1:
  779. line = f.readline()
  780. if not line:
  781. break
  782. line = line.strip()
  783. if len(line)>0:
  784. brand = {"brand":line}
  785. list_brand.append(brand)
  786. # if len(list_brand)>100:
  787. # break
  788. task_queue = Queue()
  789. for _d in list_brand:
  790. task_queue.put(_d)
  791. pm = Product_Manager()
  792. def _handle(item,result_queue):
  793. brand = item.get("brand")
  794. new_brand = clean_product_brand(brand)
  795. _f = pm.check_new_brand(brand)
  796. item["f"] = _f
  797. item["new_brand"] = new_brand
  798. mt = MultiThreadHandler(task_queue,_handle,None,30,1)
  799. mt.run()
  800. list_legal_brand = []
  801. list_illegal_brand = []
  802. for _d in list_brand:
  803. f = _d.get("f")
  804. log("brand %s flag %s"%(brand,str(f)))
  805. if f:
  806. brand = _d.get("new_brand")
  807. list_legal_brand.append(brand)
  808. else:
  809. brand = _d.get("brand")
  810. list_illegal_brand.append(brand)
  811. with open("legal_brand.txt","w",encoding="utf8") as f:
  812. for b in list_legal_brand:
  813. f.write(b+"\n")
  814. with open("illegal_brand.txt","w",encoding="utf8") as f:
  815. for b in list_illegal_brand:
  816. f.write(b+"\n")
  817. def test_match():
  818. a = "Mini-7"
  819. vector = request_embedding(a)
  820. pm = Product_Manager()
  821. Coll,_ = pm.get_collection(NAME_GRADE)
  822. output_fields = ['ots_id','ots_name',"ots_parent_id","standard_name","standard_name_id"]
  823. search_list = search_embedding(Coll,embedding_index_name,[vector],pm.search_params,output_fields,limit=60)
  824. print(search_list)
def test():
    """Ad-hoc driver: enable exactly one of the manual routines below."""
    # pm = Product_Manager()
    # pm.test()
    fix_product_data()
    # test_check_brand()
    # test_match()
if __name__ == '__main__':
    # Manual entry point: the scheduler start is disabled in favour of the
    # ad-hoc test() driver plus a few spot checks below.
    # start_process_product()
    # print(getMD5('11936c56f2dd1426764e317ca2e8e1a7'+'&&鱼跃'))
    test()
    # spot-check bid-file lookup and the name/specs matching helpers
    print(Product_Manager.get_bid_filemd5s(155415770,getConnect_ots()))
    name = "一"
    ots_name = "一氧化碳分析仪"
    print(is_similar(name,ots_name),check_product(name,ots_name))
    print(is_legal_specs('SCM-A/SB(0.18D)'))