testOne3.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #coding:utf8
  2. import re
  3. import time
  4. import psycopg2
  5. import fool
  6. conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
  7. cursor = conn.cursor()
  8. def getDigitsDic(unit):
  9. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  10. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  11. return DigitsDic.get(unit)
  12. def getMultipleFactor(unit):
  13. MultipleFactor = {"兆":float(1000000000000),"亿":100000000,"万":10000,"仟":1000,"千":1000,"佰":100,"百":100,"拾":10,"十":10,"元":1,"角":0.1,"分":0.01}
  14. return MultipleFactor.get(unit)
  15. def getUnifyMoney(money):
  16. #print(money)
  17. money = re.sub("[,,]","",money)
  18. result = 0
  19. chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
  20. chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]
  21. lcChnDigits = ["〇", "一", "二", "三", "四", "五", "六", "七", "八", "九"]
  22. lcChnFactorUnits = ["兆", "亿", "万", "千", "百", "十","元","角","分"]
  23. DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
  24. "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9}
  25. MultipleFactor = {"兆":float(1000000000000),"亿":100000000,"万":10000,"仟":1000,"千":1000,"佰":100,"百":100,"拾":10,"十":10,"元":1,"角":0.1,"分":0.01}
  26. LowMoneypattern = re.compile("^(\d+,?)+(\.\d+)?$")
  27. BigMoneypattern = re.compile("^[%s]$"%("".join(chnDigits)))
  28. if re.search(LowMoneypattern,money) is not None:
  29. return float(money)
  30. elif re.search(BigMoneypattern,money) is not None:
  31. return DigitsDic.get(money)
  32. for factorUnit in chnFactorUnits:
  33. if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
  34. subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
  35. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[0]) is not None:
  36. result += float(subMoneys[0])*MultipleFactor.get(factorUnit)
  37. elif len(subMoneys[0])==1:
  38. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
  39. result += DigitsDic.get(subMoneys[0])*MultipleFactor.get(factorUnit)
  40. else:
  41. result += float(getUnifyMoney(subMoneys[0]))*MultipleFactor.get(factorUnit)
  42. if len(subMoneys)>1:
  43. if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
  44. result += float(subMoneys[1])
  45. elif len(subMoneys[1])==1:
  46. if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
  47. result += DigitsDic.get(subMoneys[1])
  48. else:
  49. result += float(getUnifyMoney(subMoneys[1]))
  50. break
  51. return result
# Driver script: load the tokenised sentences of one document, find money
# expressions in each sentence with a regular expression, print every hit
# together with its normalised amount and the token positions it spans.
''''''  # empty string literal used as a visual separator; it has no effect
doc_id = "fdac7fd9-9c74-11e8-b65a-44a84246dbba"
# NOTE(review): the query is built by string concatenation. doc_id is a
# constant here, but if it ever comes from outside it should be passed as a
# query parameter instead.
sql = " select tokens,sentence_index from sentences where doc_id='"+doc_id+"' order by sentence_index asc "
cursor.execute(sql)
rows = cursor.fetchall()
for row in rows:
    #text = ",中标金额:人民币(万元):5,700,万,陆拾柒万玖仟陆佰伍拾元陆角柒分(¥679,650.00)"
    # row[0] is the `tokens` column, row[1] the `sentence_index` column.
    print(row[1])
    tokens = row[0]
    # list_tokenbegin[k] is the character offset at which token k starts
    # inside "".join(tokens).
    list_tokenbegin = []
    begin = 0
    for i in range(0,len(tokens)):
        list_tokenbegin.append(begin)
        begin += len(str(tokens[i]))
    # Extra final entry, one past the total text length; presumably a
    # sentinel so the offset lookups below always find a larger entry.
    list_tokenbegin.append(begin+1)
    #money_patten = re.compile("((?:#sp#)(?:\d+,?)+(?:.\d{2,4})+(?:#sp#)|(?:(?:#sp#)[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{1,})+(?:#sp#))*")
    # findall() returns a 5-tuple per match:
    #   [0] the outer group (one money expression including any prefix)
    #   [1] a number followed by an optional magnitude and 元/整
    #   [2] a run of three or more Chinese numeral/unit characters
    #   [3] an optional 万 taken from a "元" label in front of a number
    #   [4] the number that follows such a label or a ¥ sign
    # The outer group is wrapped in `*`, so the pattern also matches the
    # empty string and findall() yields empty tuples between real hits.
    # NOTE(review): the pattern is recompiled for every row although it
    # never changes.
    money_patten = re.compile("(([1-9][\d+,]+(?:\.\d{2,4})?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d+,]+(?:\.\d{2,4})?(?:,?)[百千万亿]?))*")
    all_match = re.findall(money_patten,"".join(tokens))
    # `index` is a running character offset into the joined text. It is
    # advanced by one for every empty match and by the length of group [0]
    # for every non-empty match.
    # NOTE(review): this relies on each empty findall result standing for
    # exactly one skipped character, and on group [0] covering the whole
    # match. If the outer group repeats, [0] holds only the last repetition
    # and the offset drifts. Verify against the Python version in use.
    index = 0
    for i in range(len(all_match)):
        if len(all_match[i][0])>0:
            unit = ""
            if len(all_match[i][1])>0:
                entity_text = all_match[i][1]
            elif len(all_match[i][2])>0:
                entity_text = all_match[i][2]
            else:
                entity_text = all_match[i][4]
                unit = all_match[i][3]
            # Skip the label/prefix part so `index` points at the first
            # character of the amount itself.
            index += len(all_match[i][0])-len(entity_text)
            #entity_text = getUnifyMoney(all_match[i])
            # Token containing the first character of the amount.
            # NOTE(review): begin_index / end_index are only assigned when
            # one of the loops below breaks; otherwise they keep the value
            # from an earlier match or are undefined.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j]==index:
                    begin_index = j
                    break
                elif list_tokenbegin[j]>index:
                    begin_index = j-1
                    break
            index += len(str(entity_text))
            # Token containing the last character of the amount.
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j]>=index:
                    end_index = j-1
                    break
            print("".join(tokens))
            print(entity_text)
            print(unit)
            # Scale the amount when the label carried a unit; in this
            # pattern the unit can only be 万.
            if len(unit)>0:
                entity_text = getUnifyMoney(entity_text)*getMultipleFactor(unit)
            else:
                entity_text = getUnifyMoney(entity_text)
            print(entity_text,begin_index,end_index,index)
        else:
            # Empty match: advance the offset by one character.
            index += 1