123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177 |
- import sys
- import os
- sys.path.append(os.path.abspath("../.."))
- import pandas as pd
- import re
- # from BiddingKG.dl.interface import Entitys
def re_rule(csv_path="C:\\Users\\admin\\Desktop\\alldata_error.csv"):
    """Run the funding-source rules over a labelled CSV and print what they find.

    The CSV is read with its first column as the index and must provide a
    ``text`` column (the passage to scan) and a ``test`` column (printed next
    to the row number for rows that contain a funding keyword).

    For every row the text is cut at each funding keyword, the strict rule is
    tried on each segment, and the looser fallback rule is tried only when the
    strict rule produced nothing.  Hits whose label contains "来源"/"来自" are
    preferred; when there are none, all hits are used.  The cleaned,
    de-duplicated strings are stored in a new ``myResult`` column.

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts (path or file-like
            object).  Defaults to the path that used to be hard-coded here.

    Returns:
        The loaded ``DataFrame`` with the added ``myResult`` column.  Nothing
        is written back to disk.
    """
    data = pd.read_csv(csv_path, index_col=0)

    # NOTE(review): several character classes list the same punctuation mark
    # twice (e.g. ",,").  Presumably one of each pair was a full-width variant
    # that got normalised somewhere - confirm against the upstream file.
    # Strict rule: label + one or more "<qualifier><funding word>[<percent>]".
    rule = re.compile(
        r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
        r"(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
        r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
        r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)"
    )
    # Splitter: one capturing group, so re.split() keeps the keyword it cut on.
    re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
    # Fallback rule: label + at least two characters up to the next delimiter.
    re2 = re.compile(
        r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
        r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]"
    )
    # Words that disqualify a fallback capture.
    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
    # Noise removed from a capture before it is reported.
    sub = re.compile(
        r"[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
        r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
        r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?"
    )
    # Compiled once here instead of once per row.
    from_source = re.compile(r"来[源自]")

    moneySourceList = []
    for num, (text, test_res) in enumerate(zip(data['text'], data['test'])):
        text = str(text)
        moneySource = []
        # With a capturing group, split() returns
        # [prefix, keyword, tail, keyword, tail, ...]; drop the prefix.
        pieces = re1.split(text)[1:]
        if pieces:
            print(str(num)+'==> ',test_res)
            # Glue every keyword back onto the text that followed it.
            new_split = [keyword + tail for keyword, tail in zip(pieces[::2], pieces[1::2])]

            results = []
            for item in new_split:
                print('item:',item)
                found = rule.search(item)
                if found:
                    groupdict1 = found.groupdict()
                    source1 = groupdict1['moneySource']
                    print("source1: ",source1)
                    print(groupdict1)
                    if source1:
                        results.append(groupdict1)

            # The looser rule only runs when the strict one found nothing.
            if not results:
                for item in new_split:
                    found = re2.search(item)
                    if found and not re_error.search(found.group('moneySource')):
                        results.append(found.groupdict())

            # Prefer hits labelled "...来源/来自"; otherwise keep every hit.
            preferred = [hit for hit in results if from_source.search(hit['start'])]
            moneySource = list({sub.sub("", hit['moneySource']) for hit in (preferred or results)})
        moneySourceList.append(moneySource)
        print('moneySource:==>',moneySource)

    data['myResult'] = moneySourceList
    return data
# Patterns used by extract_moneySource(); compiled once at import time.
# NOTE(review): several character classes list the same punctuation mark twice
# (e.g. ",,").  Presumably one of each pair was a full-width variant that got
# normalised somewhere - confirm against the upstream file.

# Strict rule: label + one or more "<qualifier><funding word>[<percent>]".
_MS_RULE = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
    r"(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
    r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
    r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)"
)
# Splitter: one capturing group, so re.split() keeps the keyword it cut on.
_MS_SPLIT = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
# Fallback rule: label + at least four characters up to the next delimiter.
_MS_FALLBACK = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
    r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]"
)
# Words that disqualify a fallback match.
_MS_REJECT = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
# Noise removed from a capture before it is reported.
_MS_CLEAN = re.compile(
    r"[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
    r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
    r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?"
)
# Labels containing "来源"/"来自" take precedence over the other labels.
_MS_FROM_SOURCE = re.compile(r"来[源自]")


def _money_source_segments(text):
    """Yield ``(offset, segment)`` for every funding keyword found in *text*.

    Each segment starts at a keyword matched by ``_MS_SPLIT`` and runs up to
    the next keyword (or the end of the text); ``offset`` is the position of
    the segment's first character in *text*.  Yields nothing when no keyword
    is present.
    """
    # split() with a capturing group -> [prefix, keyword, tail, keyword, tail, ...]
    pieces = _MS_SPLIT.split(text)
    offset = len(pieces[0])
    for keyword, tail in zip(pieces[1::2], pieces[2::2]):
        segment = keyword + tail
        yield offset, segment
        offset += len(segment)


def _collect_hits(pattern, segments, prob, reject=None):
    """Search each segment once with *pattern* and return the usable hits.

    A hit is the match's ``groupdict()`` extended with ``index`` (absolute
    position of the match in the full text) and ``prob``.  Matches with an
    empty ``moneySource`` group, or whose full matched text contains a
    *reject* word, are dropped.
    """
    hits = []
    for offset, segment in segments:
        found = pattern.search(segment)
        if not found or not found.group('moneySource'):
            continue
        if reject is not None and reject.search(found.group()):
            continue
        hit = found.groupdict()
        # The match does not have to start at the segment's first character,
        # so its own start is part of the absolute position.
        hit['index'] = offset + found.start()
        hit['prob'] = prob
        hits.append(hit)
    return hits


def extract_moneySource(text):
    """Extract funding-source mentions from *text*.

    The text is cut at every funding keyword.  The strict rule is tried on
    each segment (confidence 0.9); only when it finds nothing anywhere is the
    looser fallback rule tried (confidence 0.8, rejected when the match
    contains a blacklisted word).  Hits whose label contains "来源"/"来自" are
    preferred; when there are none, all hits are kept.  Each kept capture is
    cleaned of boilerplate before being reported.

    Args:
        text: The passage to scan; it is converted with ``str()`` first.

    Returns:
        A list of dicts, in text order, each with:
        ``body`` (the cleaned text), ``begin_index`` / ``end_index`` (span of
        the body in *text*, end exclusive) and ``prob`` (0.9 or 0.8).
        Captures that clean down to an empty string, or that stay longer than
        40 characters, are omitted.  When cleaning removed characters from
        the middle of a capture, so the body is no longer a contiguous slice
        of the text, ``begin_index`` is the start of the raw capture and
        ``end_index`` is ``begin_index + len(body)``.
    """
    text = str(text)
    segments = list(_money_source_segments(text))

    results = _collect_hits(_MS_RULE, segments, 0.9)
    if not results:
        results = _collect_hits(_MS_FALLBACK, segments, 0.8, reject=_MS_REJECT)

    preferred = [hit for hit in results if _MS_FROM_SOURCE.search(hit['start'])]

    list_moneySource = []
    for hit in preferred or results:
        captured = hit['moneySource']
        entity_text = _MS_CLEAN.sub("", captured)
        # Pattern.sub() always returns a str, so test for emptiness, not None.
        if not entity_text or len(entity_text) > 40:
            continue
        # Locate the body inside the capture itself (never inside the label).
        # find() gives -1 when cleaning cut characters out of the middle;
        # anchor at the start of the capture in that case.
        relative = captured.find(entity_text)
        if relative < 0:
            relative = 0
        wordOffset_begin = hit['index'] + len(hit['start']) + relative
        list_moneySource.append({
            'body': entity_text,
            'begin_index': wordOffset_begin,
            'end_index': wordOffset_begin + len(entity_text),
            'prob': hit['prob'],
        })
    return list_moneySource
if __name__ == '__main__':
    # Manual smoke run; the CSV-driven harness stays switched off:
    # re_rule()
    #
    # The sample mixes several funding labels.  Expected spans of the first
    # two hits are (11, 23) and (35, 37).
    # A negative sample to swap in (it contains "是否", which the fallback
    # rule rejects): ",资金来源是否都是要具体到每条来源明细,"
    sample_text = (
        "a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。"
        "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
    )
    extracted = extract_moneySource(sample_text)
    print(extracted)
|