testOne.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. #coding:utf8
  2. import re
  3. import fool
  4. from decimal import *
  5. def getDigitsDic(unit):
  6. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  7. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  8. return DigitsDic.get(unit)
  9. def getMultipleFactor(unit):
  10. MultipleFactor = {"兆":float(1000000000000),"亿":100000000,"万":10000,"仟":1000,"千":1000,"佰":100,"百":100,"拾":10,"十":10,"元":1,"角":0.1,"分":0.01}
  11. return MultipleFactor.get(unit)
  12. def getUnifyMoney(money):
  13. #print(money)
  14. money = re.sub("[,,]","",money)
  15. result = Decimal(0)
  16. chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
  17. chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]
  18. lcChnDigits = ["〇", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
  19. lcChnFactorUnits = ["兆", "亿", "万", "千", "百", "十","元","角","分"]
  20. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  21. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  22. MultipleFactor = {"兆":Decimal(1000000000000),"亿":Decimal(100000000),"万":Decimal(10000),"仟":Decimal(1000),"千":Decimal(1000),"佰":Decimal(100),"百":Decimal(100),"拾":Decimal(10),"十":Decimal(10),"元":Decimal(1),"角":round(Decimal(0.1),1),"分":round(Decimal(0.01),2)}
  23. LowMoneypattern = re.compile("^(\d+,?)+(\.\d+)?$")
  24. BigMoneypattern = re.compile("^[%s]$"%("".join(chnDigits)))
  25. if re.search(LowMoneypattern,money) is not None:
  26. return Decimal(money)
  27. elif re.search(BigMoneypattern,money) is not None:
  28. return DigitsDic.get(money)
  29. for factorUnit in chnFactorUnits:
  30. if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
  31. subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
  32. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None:
  33. result += Decimal(subMoneys[0])*(MultipleFactor.get(factorUnit))
  34. elif len(subMoneys[0])==1:
  35. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
  36. result += Decimal(DigitsDic.get(subMoneys[0]))*(MultipleFactor.get(factorUnit))
  37. else:
  38. result += Decimal(getUnifyMoney(subMoneys[0]))*(MultipleFactor.get(factorUnit))
  39. if len(subMoneys)>1:
  40. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
  41. result += Decimal(subMoneys[1])
  42. elif len(subMoneys[1])==1:
  43. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
  44. result += Decimal(DigitsDic.get(subMoneys[1]))
  45. else:
  46. result += Decimal(getUnifyMoney(subMoneys[1]))
  47. break
  48. return result
  49. print(getUnifyMoney("壹佰肆拾贰万柒仟贰佰伍拾陆元肆角肆分"))
  50. print(Decimal(0.4))
  51. ''''''
  52. text = "金额(万元):2017年"
  53. tokens = fool.cut(text)[0]
  54. entity_type = "money"
  55. list_tokenbegin = []
  56. begin = 0
  57. for i in range(0,len(tokens)):
  58. list_tokenbegin.append(begin)
  59. begin += len(str(tokens[i]))
  60. list_tokenbegin.append(begin+1)
  61. money_patten_str = "(([1-9][\d+,?]*(?:\.\d{1,6})?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d+,?]*(?:\.\d{1,6})?(?:,?)[百千万亿]?))*"
  62. money_patten = re.compile(money_patten_str)
  63. money_patten_all = re.compile("^"+money_patten_str+"$")
  64. all_match = re.findall(money_patten,text)
  65. print(all_match)
  66. print(tokens)
  67. print(list_tokenbegin)
  68. index = 0
  69. for i in range(len(all_match)):
  70. if len(all_match[i][0])>0:
  71. unit = ""
  72. if len(all_match[i][1])>0:
  73. entity_text = all_match[i][1]
  74. elif len(all_match[i][2])>0:
  75. entity_text = all_match[i][2]
  76. else:
  77. print(1)
  78. entity_text = all_match[i][4]
  79. unit = all_match[i][3]
  80. #index += len(all_match[i][0])-len(entity_text)#整个提出来的作为实体
  81. #entity_text = getUnifyMoney(all_match[i])
  82. for j in range(len(list_tokenbegin)):
  83. if list_tokenbegin[j]==index:
  84. begin_index = j
  85. break
  86. elif list_tokenbegin[j]>index:
  87. begin_index = j-1
  88. break
  89. #index += len(str(entity_text))#整个提出来的作为实体
  90. index += len(str(all_match[i][0]))
  91. for j in range(len(list_tokenbegin)):
  92. if list_tokenbegin[j]>=index:
  93. end_index = j-1
  94. break
  95. print(entity_text)
  96. if re.search(money_patten_all,"".join(tokens[begin_index:end_index+1])) is None:
  97. print("not")
  98. if len(unit)>0:
  99. print(0)
  100. entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit))
  101. else:
  102. entity_text = str(getUnifyMoney(entity_text))
  103. print(begin_index,end_index,entity_text)
  104. else:
  105. index += 1