ruleExtra.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. import sys
  2. import os
  3. sys.path.append(os.path.abspath("../.."))
  4. import pandas as pd
  5. import re
  6. # from BiddingKG.dl.interface import Entitys
  7. def re_rule():
  8. data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
  9. rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
  10. "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
  11. "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
  12. "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
  13. num = 0
  14. moneySourceList = []
  15. re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
  16. re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
  17. r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
  18. re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
  19. sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
  20. "及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
  21. "及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?")
  22. for text,test_res in zip(data['text'],data['test']):
  23. text = str(text)
  24. moneySource = []
  25. results = []
  26. if re1.search(text):
  27. print(str(num)+'==> ',test_res)
  28. text_split = re1.split(text)[1:]
  29. new_split = []
  30. index = 0
  31. while index<len(text_split):
  32. new = text_split[index]+text_split[index+1]
  33. new_split.append(new)
  34. index += 2
  35. for item in new_split:
  36. print('item:',item)
  37. # print( rule.search(item).groupdict())
  38. # print('')
  39. if rule.search(item):
  40. groupdict1 = rule.search(item).groupdict()
  41. source1 = groupdict1['moneySource']
  42. print("source1: ",source1)
  43. print(groupdict1)
  44. if source1:
  45. results.append(groupdict1)
  46. if len(results)==0:
  47. for item in new_split:
  48. if re2.search(item):
  49. groupdict2 = re2.search(item).groupdict()
  50. source3 = groupdict2['moneySource']
  51. # print("source3==>",source3)
  52. if not re_error.search(source3):
  53. results.append(groupdict2)
  54. sign = 0
  55. aaa = re.compile(r"来[源自]")
  56. for result in results:
  57. if aaa.search(result['start']):
  58. moneySource.append(sub.sub("",result['moneySource']))
  59. sign += 1
  60. if len(results)>0 and sign==0:
  61. for result in results:
  62. moneySource.append(sub.sub("",result['moneySource']))
  63. moneySource = list(set(moneySource))
  64. moneySourceList.append(moneySource)
  65. print('moneySource:==>',moneySource)
  66. num += 1
  67. data['myResult'] = moneySourceList
  68. # data.to_csv("C:\\Users\\admin\\Desktop\\source2.csv")
  69. def extract_moneySource(text):
  70. rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
  71. "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
  72. "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
  73. "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
  74. re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
  75. re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
  76. r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
  77. re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
  78. sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
  79. "及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
  80. "及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?")
  81. text = str(text)
  82. moneySource = []
  83. results = []
  84. if re1.search(text):
  85. text_split = re1.split(text)
  86. word_index = len(text_split[0])
  87. copy_index = word_index
  88. text_split = text_split[1:]
  89. new_split = []
  90. index = 0
  91. while index < len(text_split):
  92. new = text_split[index] + text_split[index + 1]
  93. new_split.append(new)
  94. index += 2
  95. for item in new_split:
  96. # print('item:', item)
  97. res = rule.search(item)
  98. if res:
  99. groupdict1 = res.groupdict()
  100. source1 = groupdict1['moneySource']
  101. # print('group:',res.group())
  102. # print("source1: ", source1)
  103. # print(groupdict1)
  104. if source1:
  105. groupdict1["index"] = word_index
  106. # print(groupdict1['index'])
  107. results.append(groupdict1)
  108. word_index += len(item)
  109. # print(word_index)
  110. if len(results) == 0:
  111. for item in new_split:
  112. res = re2.search(item)
  113. if res:
  114. groupdict2 = res.groupdict()
  115. source2 = groupdict2['moneySource']
  116. # print("source2==>",source2)
  117. if source2 and not re_error.search(source2):
  118. groupdict2["index"] = copy_index
  119. results.append(groupdict2)
  120. copy_index += len(item)
  121. first = []
  122. second = []
  123. re_first = re.compile(r"来[源自]")
  124. for result in results:
  125. if re_first.search(result['start']):
  126. first.append(result)
  127. else:
  128. second.append(result)
  129. if len(first) == 0 :
  130. first = second
  131. # print(first)
  132. list_moneySource = []
  133. for result in first:
  134. entity_text = sub.sub("",result['moneySource'])
  135. # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
  136. if entity_text is None:
  137. continue
  138. else:
  139. wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
  140. wordOffset_end = wordOffset_begin + len(entity_text)
  141. # print(entity_text,wordOffset_begin,wordOffset_end)
  142. _moneySource = dict()
  143. _moneySource['body'] = entity_text
  144. _moneySource['begin_index'] = wordOffset_begin
  145. _moneySource['end_index'] = wordOffset_end
  146. # print(_moneySource)
  147. list_moneySource.append(_moneySource)
  148. return list_moneySource
  149. if __name__ == '__main__':
  150. # re_rule()
  151. test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
  152. "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
  153. # 11,23 35,37
  154. print(extract_moneySource(test))
  155. pass