"""Regex-based extraction of funding-source ("资金来源") phrases from text.

``extract_moneySource`` is the entry point; ``re_rule`` is a debugging helper
that runs the same rules over a local CSV file and prints the results.

NOTE(review): several character classes below contain the same punctuation
mark twice (e.g. ``[,,;;]``).  Presumably the duplicates were distinct
full-width / half-width variants before the file was normalised -- confirm
against the upstream copy.  They are kept exactly as found.
"""
import sys
import os
sys.path.append(os.path.abspath("../.."))
import pandas as pd
import re
# from BiddingKG.dl.interface import Entitys

# Primary rule: a funding label (group ``start``) followed by a phrase built
# from known funding keywords, optionally with percentages (``moneySource``).
_PRIMARY_RE = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
    r"(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
    r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
    r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)"
)

# Splits a text at every funding label.  It has exactly one capturing group,
# so ``split`` returns [prefix, label, tail, label, tail, ...].
_LABEL_SPLIT_RE = re.compile(
    r"(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)"
)

# Fallback rule: a funding label followed by any short phrase (two or more
# characters) that runs up to the next punctuation mark.
_FALLBACK_RE = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
    r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]"
)

# Fallback captures containing any of these words are discarded.
_REJECT_RE = re.compile(
    r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否"
)

# Reject list used by ``re_rule``; identical to ``_REJECT_RE`` minus "是否".
_REJECT_RE_DEBUG = re.compile(
    r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析"
)

# Noise removed from a captured phrase before it is reported.
_CLEANUP_RE = re.compile(
    r"[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
    r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
    r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,,]*万?元|来[自源]?"
)

# A label that explicitly says "source" ("来源" / "来自").
_SOURCE_LABEL_RE = re.compile(r"来[源自]")


def _split_segments(text):
    """Cut *text* at each funding label.

    Returns ``(offset, segments)`` where ``offset`` is the position of the
    first segment in *text* and every segment is a label joined with the text
    that follows it (up to the next label).
    """
    pieces = _LABEL_SPLIT_RE.split(text)
    segments = [label + tail for label, tail in zip(pieces[1::2], pieces[2::2])]
    return len(pieces[0]), segments


def _collect_candidates(text, reject_pattern, debug=False):
    """Return candidate matches as dicts with ``start``, ``moneySource``, ``index``.

    The primary rule is tried on every segment first.  Only when it yields
    nothing is the looser fallback rule used, and its captures are dropped
    when *reject_pattern* matches them.  ``index`` is the offset in *text* at
    which the matched label begins.  With *debug* true, each non-empty
    fallback capture is printed.
    """
    offset, segments = _split_segments(text)

    candidates = []
    position = offset
    for segment in segments:
        match = _PRIMARY_RE.search(segment)
        if match and match.group("moneySource"):
            candidate = match.groupdict()
            candidate["index"] = position + match.start()
            candidates.append(candidate)
        position += len(segment)
    if candidates:
        return candidates

    position = offset
    for segment in segments:
        match = _FALLBACK_RE.search(segment)
        if match:
            captured = match.group("moneySource")
            if captured and debug:
                print("source3==>", captured)
            if captured and not reject_pattern.search(captured):
                candidate = match.groupdict()
                # The fallback rule may match after the start of the segment,
                # so the match offset has to be added to the segment offset.
                candidate["index"] = position + match.start()
                candidates.append(candidate)
        position += len(segment)
    return candidates


def _prefer_source_labels(candidates):
    """Keep candidates whose label contains "来源"/"来自"; if none do, keep all."""
    labelled = [c for c in candidates if _SOURCE_LABEL_RE.search(c["start"])]
    return labelled or candidates


def re_rule():
    """Debug helper: run the extraction rules over a local CSV and print results.

    Reads the ``text`` and ``test`` columns of a hard-coded CSV file, prints
    the ``test`` value and the de-duplicated cleaned phrases for every row and
    stores the phrases in a new ``myResult`` column of the in-memory frame
    (writing the frame back to disk is commented out).

    NOTE(review): the part of this function between splitting the text and
    filtering the fallback captures was missing from the copy of the file this
    was restored from.  It has been rebuilt to mirror ``extract_moneySource``
    (including the ``source3`` debug print, of which only the tail survived)
    -- confirm against the upstream original.
    """
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
    moneySourceList = []
    for num, (text, test_res) in enumerate(zip(data['text'], data['test'])):
        text = str(text)
        moneySource = []
        if _LABEL_SPLIT_RE.search(text):
            print(str(num) + '==> ', test_res)
            candidates = _collect_candidates(text, _REJECT_RE_DEBUG, debug=True)
            moneySource = list({
                _CLEANUP_RE.sub("", candidate['moneySource'])
                for candidate in _prefer_source_labels(candidates)
            })
        moneySourceList.append(moneySource)
        print('moneySource:==>', moneySource)
    data['myResult'] = moneySourceList
    # data.to_csv("C:\\Users\\admin\\Desktop\\source2.csv")


def extract_moneySource(text):
    """Extract funding-source phrases from *text*.

    *text* is converted with ``str()`` first, so any object is accepted.

    Returns a list of dicts, in order of appearance, each with:

    * ``body`` -- the cleaned phrase,
    * ``begin_index`` / ``end_index`` -- half-open character offsets of the
      phrase in the (stringified) text.

    An empty list is returned when no funding label is present.  Phrases that
    are empty after clean-up are skipped.
    """
    text = str(text)
    if not _LABEL_SPLIT_RE.search(text):
        return []

    list_moneySource = []
    for candidate in _prefer_source_labels(_collect_candidates(text, _REJECT_RE)):
        entity_text = _CLEANUP_RE.sub("", candidate['moneySource'])
        # ``re.sub`` never returns None; an empty string means the clean-up
        # removed everything, so there is nothing to report.
        if not entity_text:
            continue
        matched_span = candidate['start'] + candidate['moneySource']
        # NOTE(review): when the clean-up deletes characters from the middle
        # of the phrase, ``entity_text`` is no longer a substring and ``find``
        # returns -1, which makes the offsets unreliable for that entity.
        wordOffset_begin = candidate['index'] + matched_span.find(entity_text)
        list_moneySource.append({
            'body': entity_text,
            'begin_index': wordOffset_begin,
            'end_index': wordOffset_begin + len(entity_text),
        })
    return list_moneySource


if __name__ == '__main__':
    # re_rule()
    test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
          "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
    # 11,23 35,37
    print(extract_moneySource(test))
    pass