compare_utils.py 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
import json
import math
import re
from collections import Counter
from decimal import Decimal

from fuzzywuzzy import fuzz
  5. def getDigitsDic(unit):
  6. '''
  7. @summary:拿到中文对应的数字
  8. '''
  9. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  10. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  11. return DigitsDic.get(unit)
  12. def getMultipleFactor(unit):
  13. '''
  14. @summary:拿到单位对应的值
  15. '''
  16. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"圆":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  17. return MultipleFactor.get(unit)
  18. def getUnifyMoney(money):
  19. '''
  20. @summary:将中文金额字符串转换为数字金额
  21. @param:
  22. money:中文金额字符串
  23. @return: decimal,数据金额
  24. '''
  25. if money in [None, '', '-']:
  26. return Decimal(0)
  27. money = str(money)
  28. MAX_MONEY = 1000000000000
  29. MAX_NUM = 12
  30. #去掉逗号
  31. money = re.sub("[,,]","",money)
  32. money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
  33. result = Decimal(0)
  34. chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
  35. # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
  36. chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"] # 20240611 修复大写提取错误 '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365')
  37. LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
  38. BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
  39. try:
  40. if re.search(LowMoneypattern,money) is not None:
  41. return Decimal(money)
  42. elif re.search(BigMoneypattern,money) is not None:
  43. return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
  44. for factorUnit in chnFactorUnits:
  45. if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
  46. subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
  47. if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
  48. if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
  49. return Decimal(0)
  50. result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
  51. elif len(subMoneys[0])==1:
  52. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
  53. result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
  54. # subMoneys[0]中无金额单位,不可再拆分
  55. elif subMoneys[0]=="":
  56. result += 0
  57. elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
  58. # print(subMoneys)
  59. # subMoneys[0] = subMoneys[0][0]
  60. result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
  61. else:
  62. result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
  63. if len(subMoneys)>1:
  64. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
  65. result += Decimal(subMoneys[1])
  66. elif len(subMoneys[1])==1:
  67. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
  68. result += Decimal(getDigitsDic(subMoneys[1]))
  69. else:
  70. result += Decimal(getUnifyMoney(subMoneys[1]))
  71. break
  72. except Exception as e:
  73. # traceback.print_exc()
  74. return Decimal(0)
  75. return result
  76. def text_sim(s1, s2):
  77. s1 = str(s1 or "").strip()
  78. s2 = str(s2 or "").strip()
  79. if not s1 and not s2:
  80. return 1.0
  81. if not s1 or not s2:
  82. return 0.0
  83. return fuzz.token_sort_ratio(s1, s2) / 100
  84. def num_sim(n1, n2):
  85. try:
  86. a = float(n1)
  87. b = float(n2)
  88. except:
  89. return 0.0
  90. diff = abs(a - b)
  91. return 1.0 - diff / max(a, b, 0.01)
  92. def product_similarity(p1, p2):
  93. score = 0.0
  94. # score += text_sim(p1.get("产品名称"), p2.get("产品名称")) * 0.4
  95. # score += text_sim(p1.get("规格型号"), p2.get("规格型号")) * 0.3
  96. # score += text_sim(p1.get("品牌"), p2.get("品牌")) * 0.1
  97. # score += num_sim(p1.get("单价"), p2.get("单价")) * 0.08
  98. # score += num_sim(p1.get("数量"), p2.get("数量")) * 0.06
  99. # score += num_sim(p1.get("总价"), p2.get("总价")) * 0.06
  100. score += text_sim(p1.get("产品名称"), p2.get("产品名称")) * 0.125
  101. score += text_sim(p1.get("规格型号"), p2.get("规格型号")) * 0.125
  102. score += text_sim(p1.get("品牌"), p2.get("品牌")) * 0.125
  103. score += num_sim(p1.get("单价"), p2.get("单价")) * 0.125
  104. score += num_sim(p1.get("数量"), p2.get("数量")) * 0.125
  105. score += num_sim(p1.get("总价"), p2.get("总价")) * 0.125
  106. score += text_sim(p1.get("品目编号"), p2.get("品目编号")) * 0.125
  107. score += text_sim(p1.get("品目名称"), p2.get("品目名称")) * 0.125
  108. return score
  109. def calculate_matching_ratio(list_a, list_b, threshold=0.6):
  110. if list_a is None or list_b is None:
  111. return 0
  112. used_b = [False] * len(list_b)
  113. match_count = 0
  114. for a in list_a:
  115. best_score = 0
  116. best_idx = -1
  117. for i, b in enumerate(list_b):
  118. if used_b[i]:
  119. continue
  120. score = product_similarity(a, b)
  121. if score > best_score:
  122. best_score = score
  123. best_idx = i
  124. if best_score >= threshold:
  125. match_count += 1
  126. if best_idx != -1:
  127. used_b[best_idx] = True
  128. total = max(len(list_a), len(list_b))
  129. return match_count / total if total != 0 else 1.0
  130. def calculate_complete_match_ratio(list1, list2):
  131. if list1 is None or list2 is None:
  132. return 0
  133. example_d = {"产品名称": "树脂等物资", "单价": "未公开", "数量": "未公开", "数量单位": "未公开", "总价": "未公开",
  134. "品牌": "未公开", "规格型号": "未公开", "品目编号": "未公开", "品目名称": "未公开"}
  135. cols = list(example_d.keys())
  136. str_list1 = []
  137. for d in list1:
  138. str1 = '@'.join([str(d.get(x, '')) for x in cols])
  139. str_list1.append(str1)
  140. str_list2 = []
  141. for d in list2:
  142. str2 = '@'.join([str(d.get(x, '')) for x in cols])
  143. str_list2.append(str2)
  144. match_cnt = 0
  145. for str1 in str_list1:
  146. if str1 in str_list2:
  147. match_cnt += 1
  148. all_cnt = len(list(set(str_list1) | set(str_list2)))
  149. ratio = round(match_cnt / all_cnt, 2)
  150. return ratio
  151. def calculate_cnt_ratio(list1, list2):
  152. if list1 is None:
  153. list1 = []
  154. if list2 is None:
  155. list2 = []
  156. if len(list1) == len(list2):
  157. return 1
  158. else:
  159. return 0
  160. def compare_products(product_list1, product_list2):
  161. product_list1 = json.dumps(product_list1, ensure_ascii=False)
  162. product_list2 = json.dumps(product_list2, ensure_ascii=False)
  163. product_list1 = re.sub('未公开', '', product_list1)
  164. product_list2 = re.sub('未公开', '', product_list2)
  165. product_list1 = json.loads(product_list1)
  166. product_list2 = json.loads(product_list2)
  167. money_cols = ['单价', '总价']
  168. if product_list1:
  169. for d in product_list1:
  170. for col in money_cols:
  171. d[col] = getUnifyMoney(d.get(col))
  172. if product_list2:
  173. for d in product_list2:
  174. for col in money_cols:
  175. d[col] = getUnifyMoney(d.get(col))
  176. weight_score = calculate_matching_ratio(product_list1, product_list2)
  177. complete_score = calculate_complete_match_ratio(product_list1, product_list2)
  178. cnt_score = calculate_cnt_ratio(product_list1, product_list2)
  179. return weight_score, complete_score, cnt_score