ruleExtra.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. import sys
  2. import os
  3. sys.path.append(os.path.abspath("../.."))
  4. import pandas as pd
  5. import re
  6. # from BiddingKG.dl.interface import Entitys
  def re_rule(csv_path="C:\\Users\\admin\\Desktop\\alldata_error.csv"):
      """Debug driver: run the funding-source rules over a CSV and print every hit.

      The CSV is read with its first column as the index and must provide a
      ``text`` column (document text) and a ``test`` column (printed next to the
      row number for eyeballing). For each row the cleaned funding-source strings
      are printed and collected into a local ``myResult`` column; nothing is
      written back to disk.

      Args:
          csv_path: Location of the CSV to evaluate. Defaults to the path the
              original script had hard-coded.

      Returns:
          None. All output goes to stdout.
      """
      # NOTE(review): the listing this was recovered from had lost its
      # indentation; the nesting below is reconstructed from data flow
      # (one ``myResult`` entry per row) -- confirm against the original file.
      # NOTE(review): several alternations list the same ASCII punctuation twice
      # (e.g. ``(,|,|``); presumably full-width variants were lost in transit.
      data = pd.read_csv(csv_path, index_col=0)

      # Raw strings: the patterns contain ``\d`` / ``\)``, which are invalid
      # escapes in ordinary string literals.
      rule = re.compile(
          r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
          r"(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
          r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
          r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
      # Trigger phrases; the single capturing group makes ``split`` keep them.
      re1 = re.compile(r"(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
      # Looser fallback: anything (2+ chars) up to the next clause delimiter.
      re2 = re.compile(
          r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
          r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
      # Words that mark a fallback capture as a false positive.
      re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
      # Boilerplate stripped from a capture before it is reported.
      sub = re.compile(
          r"[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
          r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
          r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?")
      # Compiled once here instead of once per row.
      explicit_source = re.compile(r"来[源自]")

      moneySourceList = []
      for num, (text, test_res) in enumerate(zip(data['text'], data['test'])):
          text = str(text)
          results = []
          if re1.search(text):
              print(str(num)+'==> ',test_res)
              # split() yields [prefix, trigger, tail, trigger, tail, ...];
              # drop the prefix and glue each trigger back onto its tail.
              parts = re1.split(text)[1:]
              items = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
              for item in items:
                  print('item:',item)
                  match = rule.search(item)
                  if match:
                      groupdict1 = match.groupdict()
                      source1 = groupdict1['moneySource']
                      print("source1: ",source1)
                      print(groupdict1)
                      if source1:
                          results.append(groupdict1)
              if not results:
                  # Strict rule found nothing: try the loose one, filtered.
                  for item in items:
                      match = re2.search(item)
                      if match:
                          groupdict2 = match.groupdict()
                          if not re_error.search(groupdict2['moneySource']):
                              results.append(groupdict2)
          # Prefer hits whose trigger explicitly says "source"; otherwise keep all.
          preferred = [r for r in results if explicit_source.search(r['start'])]
          chosen = preferred or results
          # dict.fromkeys de-duplicates while keeping first-seen order, so the
          # printed list is deterministic (list(set(...)) was not).
          moneySource = list(dict.fromkeys(sub.sub("", r['moneySource']) for r in chosen))
          moneySourceList.append(moneySource)
          print('moneySource:==>',moneySource)
      data['myResult'] = moneySourceList
      # Export is intentionally disabled:
      # data.to_csv("C:\\Users\\admin\\Desktop\\source2.csv")
  # NOTE(review): several alternations / character classes below list the same
  # ASCII punctuation twice (e.g. ``(,|,|``); presumably full-width variants were
  # lost in transit -- confirm against the original file. Patterns are kept
  # exactly as found. Raw strings are used because the patterns contain ``\d``
  # and ``\)``, which are invalid escapes in ordinary string literals.

  # Strict rule: trigger phrase followed by clauses ending in a funding keyword.
  _MS_STRICT = re.compile(
      r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
      r"(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
      r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
      r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
  # Trigger phrases; the single capturing group makes ``split`` keep them.
  _MS_TRIGGER = re.compile(r"(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
  # Loose fallback: anything (4+ chars) up to the next clause delimiter.
  _MS_LOOSE = re.compile(
      r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
      r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]")
  # Words that mark a loose match as a false positive.
  _MS_REJECT = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
  # Boilerplate stripped from a capture before it is reported.
  _MS_NOISE = re.compile(
      r"[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
      r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
      r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?")
  # A trigger that literally says "source" outranks the generic ones.
  _MS_EXPLICIT_SOURCE = re.compile(r"来[源自]")


  def _ms_scan(segments, base_offset, pattern, prob, reject=None):
      """Search each segment with *pattern*; return hits tagged with absolute offset and prob."""
      hits = []
      offset = base_offset
      for segment in segments:
          match = pattern.search(segment)
          if match:
              hit = match.groupdict()
              vetoed = reject is not None and reject.search(match.group())
              if hit['moneySource'] and not vetoed:
                  # match.start() is normally 0 (segments begin with a trigger),
                  # but the loose pattern may only match further in.
                  hit['index'] = offset + match.start()
                  hit['prob'] = prob
                  hits.append(hit)
          # Segments are contiguous, so the running sum is the next start.
          offset += len(segment)
      return hits


  def _ms_leading_noise(raw):
      """Return how many leading characters of *raw* ``_MS_NOISE`` would strip."""
      pos = 0
      while True:
          match = _MS_NOISE.match(raw, pos)
          if match is None or match.end() == pos:
              return pos
          pos = match.end()


  def extract_moneySource(text):
      """Extract funding-source mentions from *text* using regex rules.

      The text is cut at every trigger phrase; each piece is searched with a
      strict keyword rule (prob 0.9) and, only if that finds nothing anywhere,
      with a loose fallback rule (prob 0.8). Hits whose trigger contains
      "来源"/"来自" are preferred; if there are none, all hits are used.

      Args:
          text: Document text; it is passed through ``str()`` first.

      Returns:
          A list of dicts with keys ``body`` (cleaned source text),
          ``begin_index`` / ``end_index`` (character offsets into *text*, with
          ``end_index - begin_index == len(body)``) and ``prob``. Empty when no
          trigger phrase is present. Hits whose cleaned body is empty or longer
          than 40 characters are dropped.
      """
      # NOTE(review): the listing this was recovered from had lost its
      # indentation; returning [] when no trigger matches is the reconstruction
      # assumed here -- confirm against the original file.
      text = str(text)
      # split() yields [prefix, trigger, tail, trigger, tail, ...].
      pieces = _MS_TRIGGER.split(text)
      if len(pieces) == 1:
          return []
      base_offset = len(pieces[0])
      segments = [pieces[i] + pieces[i + 1] for i in range(1, len(pieces), 2)]

      hits = _ms_scan(segments, base_offset, _MS_STRICT, 0.9)
      if not hits:
          hits = _ms_scan(segments, base_offset, _MS_LOOSE, 0.8, reject=_MS_REJECT)

      preferred = [hit for hit in hits if _MS_EXPLICIT_SOURCE.search(hit['start'])]
      chosen = preferred or hits

      list_moneySource = []
      for hit in chosen:
          raw = hit['moneySource']
          entity_text = _MS_NOISE.sub("", raw)
          # Skip captures that cleaned down to nothing (the old ``is None``
          # check could never fire) and implausibly long ones.
          if not entity_text or len(entity_text) > 40:
              continue
          # Anchor on the first kept character of the capture itself. The old
          # ``(start + moneySource).find(entity_text)`` returned -1 when noise
          # was removed mid-capture and could also land inside the trigger.
          # When noise is removed mid-capture the span is approximate: it keeps
          # len(body) and is shorter than the true extent in *text*.
          wordOffset_begin = hit['index'] + len(hit['start']) + _ms_leading_noise(raw)
          wordOffset_end = wordOffset_begin + len(entity_text)
          list_moneySource.append({
              'body': entity_text,
              'begin_index': wordOffset_begin,
              'end_index': wordOffset_end,
              'prob': hit['prob'],
          })
      return list_moneySource
  if __name__ == '__main__':
      # Manual smoke test. The CSV batch run stays disabled:
      # re_rule()
      # Sample with several trigger phrases. The original note "11,23 35,37"
      # presumably records the expected (begin, end) offsets of the first two hits.
      test = (
          "a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。"
          "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
      )
      # Alternative sample exercising the "是否" false-positive filter:
      # test = ",资金来源是否都是要具体到每条来源明细,"
      print(extract_moneySource(test))