re_ratio.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. import re
  2. from decimal import Decimal
  3. # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
  4. # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
  5. ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
  6. '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
  7. '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
  8. ratio = ratio.pattern
  9. # print(ratio)
  10. # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%, X
  11. # 基准利率上浮率:百分之三十(30%) X
  12. # 租金上浮率
  13. # 上浮率活期20%
  14. # 上浮率:活期20%、一年定期35%
  15. # 下浮率报价0.5%
  16. def re_standard_ratio(_str):
  17. reg_standard = "(?P<value>" + ratio + ")"
  18. match = re.finditer(reg_standard, _str)
  19. ratio_list = []
  20. if match:
  21. for m in match:
  22. m_dict = m.groupdict()
  23. m_span = m.span()
  24. keyword_index = [m_span[0], m_span[1]]
  25. keyword = m_dict.get("value")
  26. left = _str[max(0,m_span[0]-15):m_span[0]]
  27. right = _str[m_span[1]:m_span[1]+10]
  28. context = left + keyword + right
  29. # print(1,keyword)
  30. if not re.search("利率|保险",context) and not re.search("^[万元]",right):
  31. ratio_list.append([keyword, keyword_index])
  32. return ratio_list
  33. def re_ratio(text):
  34. # 查找符合标准形式的 总价
  35. ratio_list = re_standard_ratio(text)
  36. return ratio_list
  37. def extract_ratio(text):
  38. result_list = []
  39. total_money_list = re_ratio(text)
  40. # print(total_money_list)
  41. if total_money_list:
  42. for word, text_index in total_money_list:
  43. num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十][零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]*(?:点[零壹贰叁肆伍陆柒捌玖一二三四五六七八九]+)?(?!分之)", word).group()
  44. if re.search("[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]",num_value):
  45. if '点' in num_value:
  46. num_split = num_value.split("点")
  47. round_len = len(num_split[1])
  48. num_integer = num_split[0]
  49. if re.search("^[十拾佰百]",num_integer):
  50. num_integer = "壹" + num_integer
  51. num_value = getUnifyNum(num_integer)
  52. for index,num_word in enumerate(list(num_split[1])):
  53. num_value = float(num_value) + getDigitsDic(num_word) * 0.1**(index+1)
  54. else:
  55. round_len = 0
  56. num_value = getUnifyNum(num_value)
  57. num_value = float(num_value)
  58. if re.search("%|百分之", word):
  59. num_value = num_value / 100
  60. round_len += 2
  61. elif re.search("‰|千分之", word):
  62. num_value = num_value / 1000
  63. round_len += 3
  64. else:
  65. match_text = num_value
  66. # print(num_value)
  67. num_value = round(Decimal(re.sub('[((]|[%‰]','',num_value)),10)
  68. # print(num_value)
  69. # _num = str(num_value).split('.')[0]
  70. if len(str(num_value).split('.'))<2:
  71. continue
  72. _decimal = str(num_value).split('.')[1]
  73. _decimal = re.sub("0+$","",_decimal)
  74. # print(_decimal)
  75. if _decimal=="":
  76. _decimal = "0"
  77. # num_value = float(_num+"."+_decimal)
  78. # print(num_value)
  79. if _decimal == '0':
  80. round_len = 0
  81. else:
  82. round_len = len(_decimal)
  83. if num_value<1 and not re.search('[%‰]',match_text):
  84. pass
  85. else:
  86. if re.search("%|百分之",word):
  87. num_value = num_value / 100
  88. round_len += 2
  89. elif re.search("‰|千分之",word):
  90. num_value = num_value / 1000
  91. round_len += 3
  92. else:
  93. num_value = num_value / 100
  94. round_len += 2
  95. num_value = round(num_value, round_len)
  96. # print(word,num_value)
  97. if re.search("上浮",word):
  98. ratio_type = 'floating_ratio'
  99. elif re.search("下浮|优惠",word):
  100. ratio_type = 'downward_floating_ratio'
  101. elif re.search("折扣",word):
  102. if num_value>0.5:
  103. ratio_type = 'discount_ratio'
  104. else:
  105. ratio_type = 'downward_floating_ratio'
  106. else:
  107. ratio_type = 'discount_ratio'
  108. if num_value<=1:
  109. d = {"body": word, "begin_index": text_index[0],
  110. "end_index": text_index[1],"value":num_value,"type":ratio_type}
  111. result_list.append(d)
  112. return result_list
  113. def getDigitsDic(unit):
  114. '''
  115. @summary:拿到中文对应的数字
  116. '''
  117. DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
  118. "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
  119. return DigitsDic.get(unit)
  120. def getMultipleFactor(unit):
  121. '''
  122. @summary:拿到单位对应的值
  123. '''
  124. MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000),
  125. "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10),
  126. "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}
  127. return MultipleFactor.get(unit)
  128. def getUnifyNum(money):
  129. '''
  130. @summary:将中文金额字符串转换为数字金额
  131. @param:
  132. money:中文金额字符串
  133. @return: decimal,数据金额
  134. '''
  135. MAX_MONEY = 1000000000000
  136. MAX_NUM = 12
  137. # 去掉逗号
  138. money = re.sub("[,,]", "", money)
  139. money = re.sub("[^0-9.一二三四五六七八九零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)
  140. result = Decimal(0)
  141. chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
  142. chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
  143. LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
  144. BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))
  145. try:
  146. if re.search(LowMoneypattern, money) is not None:
  147. return Decimal(money)
  148. elif re.search(BigMoneypattern, money) is not None:
  149. return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))
  150. for factorUnit in chnFactorUnits:
  151. if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
  152. subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
  153. if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
  154. if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):
  155. return Decimal(0)
  156. result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
  157. elif len(subMoneys[0]) == 1:
  158. if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
  159. result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
  160. # subMoneys[0]中无金额单位,不可再拆分
  161. elif subMoneys[0] == "":
  162. result += 0
  163. elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
  164. # print(subMoneys)
  165. # subMoneys[0] = subMoneys[0][0]
  166. result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
  167. else:
  168. result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
  169. if len(subMoneys) > 1:
  170. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
  171. result += Decimal(subMoneys[1])
  172. elif len(subMoneys[1]) == 1:
  173. if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
  174. result += Decimal(getDigitsDic(subMoneys[1]))
  175. else:
  176. result += Decimal(getUnifyNum(subMoneys[1]))
  177. break
  178. except Exception as e:
  179. # traceback.print_exc()
  180. return Decimal(0)
  181. return result
  182. def test_str():
  183. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  184. s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
  185. ' 上浮率.... 上浮率30(%) (下浮率%):43 下浮率报价0.5%'
  186. s = '''费率%)61.20万
  187. 费率(精确到小数点后两位)60.00%
  188. 下浮率取值13%
  189. 下浮率报价13%
  190. 下浮率 百分之十点零陆(10.00%
  191. 下浮率 大写:无 下浮率百分之贰拾陆 无 小写: 下浮26%
  192. 下浮率% 30
  193. 成交优惠率% 5.00
  194. 下浮率 0.25
  195. 下浮率 0.25%
  196. 中标金额:57.75%(商业优惠率)
  197. 费率):1800
  198. 费率):12
  199. 折扣率(%):99.2063
  200. 投标报价:96.00%(折扣率
  201. '''
  202. # s = '下浮率 百分之十点零陆(10.00%'
  203. print(extract_ratio(s))
  204. def test_html():
  205. html_path = "C:/Users/Administrator/Desktop/3.html"
  206. with open(html_path, "r") as f:
  207. s = f.read()
  208. print(extract_ratio(s))
  209. if __name__ == "__main__":
  210. # extract_bidway(s)
  211. # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv"
  212. test_str()
  213. # test_html(path)
  214. pass