# luojiehua / BIDI_ML_INFO_EXTRACTION
							import sys
import os
sys.path.append(os.path.abspath("../.."))
import pandas as pd
import re
# from BiddingKG.dl.interface import Entitys

def re_rule(csv_path="C:\\Users\\admin\\Desktop\\alldata_error.csv"):
    """Replay the funding-source rules over a CSV file and print what they find.

    Debugging helper. The CSV is loaded with pandas (first column used as the
    index) and must provide a ``text`` and a ``test`` column. For every row the
    intermediate matches and the final de-duplicated list of cleaned funding
    sources are printed. The per-row lists are stored in a new ``myResult``
    column of the in-memory frame; nothing is written back to disk.

    Args:
        csv_path: Location of the CSV file to evaluate. Defaults to the path
            that used to be hard-coded in this function.

    Returns:
        None.
    """
    data = pd.read_csv(csv_path, index_col=0)

    # All patterns are raw strings so that ``\d`` / ``\)`` reach the regex
    # engine untouched instead of being (invalid) string escape sequences.
    # Strict rule: keyword prefix + phrases that end in a known funding term.
    rule = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
                      r"(?P<moneySource>([^,，。;；已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
                      r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
                      r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[:：.、\d]*%[，,;；]?)*)")
    # Keyword detector; the capture group makes split() keep the keyword.
    re1 = re.compile(r"(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已，,。.;；]|资[金佥]性质)")
    # Loose fallback: keyword prefix + any short clause up to a delimiter.
    re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
                     r"(?P<moneySource>[^,，。;；已]{2,}?)[，。；,]")
    # Words that disqualify a fallback hit.
    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
    # Noise stripped from a captured funding source before it is reported.
    sub = re.compile(r"[：:。]|^[.,，、\)]|[,，;；]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
                     r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
                     r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,，]*万?元|来[自源]?")
    # Marks a prefix that literally says "来源"/"来自"; compiled once, not per row.
    origin_keyword = re.compile(r"来[源自]")

    moneySourceList = []
    for num, (text, test_res) in enumerate(zip(data['text'], data['test'])):
        text = str(text)
        results = []
        if re1.search(text):
            print(str(num)+'==>  ',test_res)
            # split() yields [before, kw1, tail1, kw2, tail2, ...]; drop the
            # leading part and glue every keyword back onto its tail.
            parts = re1.split(text)[1:]
            new_split = [keyword + tail for keyword, tail in zip(parts[0::2], parts[1::2])]

            for item in new_split:
                print('item:',item)
                found = rule.search(item)
                if found:
                    groupdict1 = found.groupdict()
                    source1 = groupdict1['moneySource']
                    print("source1:  ",source1)
                    print(groupdict1)
                    if source1:
                        results.append(groupdict1)

            # Only when the strict rule found nothing is the loose rule tried.
            if not results:
                for item in new_split:
                    found = re2.search(item)
                    if found:
                        groupdict2 = found.groupdict()
                        if not re_error.search(groupdict2['moneySource']):
                            results.append(groupdict2)

        # Prefer hits whose prefix mentions 来源/来自; otherwise keep every hit.
        cleaned = [sub.sub("", result['moneySource'])
                   for result in results if origin_keyword.search(result['start'])]
        if results and not cleaned:
            cleaned = [sub.sub("", result['moneySource']) for result in results]

        moneySource = list(set(cleaned))
        moneySourceList.append(moneySource)
        print('moneySource:==>',moneySource)
    data['myResult'] = moneySourceList
    # data.to_csv("C:\\Users\\admin\\Desktop\\source2.csv")

# Patterns used by extract_moneySource. They are compiled once at import time
# and written as raw strings so ``\d`` / ``\)`` are regex escapes, not (invalid)
# string escapes.
# Keyword detector; the capture group makes split() keep the keyword itself.
_MS_KEYWORD_RE = re.compile(r"(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已，,。.;；]|资[金佥]性质)")
# Strict rule: keyword prefix + phrases that end in a known funding term.
_MS_STRICT_RE = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
    r"(?P<moneySource>([^,，。;；已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
    r"大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([:：.、\d]+%)?[，,;；]?)?([^,，.。;；已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
    r"财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[:：.、\d]*%[，,;；]?)*)")
# Loose fallback: keyword prefix + a clause of at least four characters that
# runs up to the next delimiter.
_MS_LOOSE_RE = re.compile(
    r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来，?源[为于]?|来自于?)?(，|,|；+|：+)?)"
    r"(?P<moneySource>[^,，。;；已]{4,}?)[，。；,]")
# Words that disqualify a fallback hit.
_MS_ERROR_RE = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
# Noise stripped from a captured funding source before it is reported.
_MS_CLEANUP_RE = re.compile(
    r"[：:。]|^[.,，、\)]|[,，;；]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
    r"及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
    r"及预算控制金额|及预算金额|及预算|及预算价|为|出资比例|^资金$|\d[\d.,，]*万?元|来[自源]?")
# Marks a prefix that literally says "来源" / "来自".
_MS_ORIGIN_RE = re.compile(r"来[源自]")


def _ms_first_kept_offset(raw):
    """Return the index in *raw* of the first character the cleanup pattern keeps."""
    kept_from = 0
    for removed in _MS_CLEANUP_RE.finditer(raw):
        if removed.start() > kept_from:
            break
        kept_from = removed.end()
    return kept_from


def extract_moneySource(text):
    """Extract funding-source mentions (资金来源 and similar) from a text.

    The text is cut at every funding keyword; each resulting segment is first
    checked with a strict rule (clauses ending in a known funding term,
    reported with ``prob`` 0.9). Only if the strict rule yields nothing for the
    whole text is a looser "anything up to the next delimiter" rule tried
    (``prob`` 0.8), and its hits are dropped when they contain one of the
    disqualifying words. Hits whose prefix contains 来源/来自 are preferred;
    when there are none, all hits are used. Each hit is cleaned of boilerplate
    before being returned; hits that clean down to nothing or to more than 40
    characters are skipped.

    Args:
        text: The text to scan. It is converted with ``str()`` first.

    Returns:
        A list of dicts, in order of appearance, each with the keys ``body``
        (cleaned funding-source text), ``begin_index`` and ``end_index``
        (character offsets into the text, end exclusive) and ``prob``.
        ``begin_index`` points at the first character of the captured clause
        that survives cleanup and ``end_index`` is ``begin_index + len(body)``;
        when cleanup removed characters from the middle of the clause the end
        offset is therefore only approximate.
    """
    text = str(text)
    pieces = _MS_KEYWORD_RE.split(text)
    if len(pieces) == 1:
        # No funding keyword anywhere in the text.
        return []

    # split() yields [before, kw1, tail1, kw2, tail2, ...]. Glue each keyword
    # back onto its tail and remember where that segment starts in the text.
    segments = []
    offset = len(pieces[0])
    for keyword, tail in zip(pieces[1::2], pieces[2::2]):
        segment = keyword + tail
        segments.append((offset, segment))
        offset += len(segment)

    # Each candidate is (segment offset in text, match object, probability).
    candidates = []
    for seg_offset, segment in segments:
        match = _MS_STRICT_RE.search(segment)
        if match and match.group('moneySource'):
            candidates.append((seg_offset, match, 0.9))

    if not candidates:
        for seg_offset, segment in segments:
            match = _MS_LOOSE_RE.search(segment)
            if (match and match.group('moneySource')
                    and not _MS_ERROR_RE.search(match.group())):
                candidates.append((seg_offset, match, 0.8))

    preferred = [c for c in candidates if _MS_ORIGIN_RE.search(c[1].group('start'))]
    if not preferred:
        preferred = candidates

    list_moneySource = []
    for seg_offset, match, prob in preferred:
        raw = match.group('moneySource')
        entity_text = _MS_CLEANUP_RE.sub("", raw)
        if not entity_text or len(entity_text) > 40:
            continue
        # The match may start anywhere inside the segment, so the offset is
        # taken from the match itself rather than assumed to be the segment
        # start; it is then advanced past whatever cleanup stripped in front.
        wordOffset_begin = (seg_offset + match.start('moneySource')
                            + _ms_first_kept_offset(raw))
        list_moneySource.append({
            'body': entity_text,
            'begin_index': wordOffset_begin,
            'end_index': wordOffset_begin + len(entity_text),
            'prob': prob,
        })
    return list_moneySource


if __name__ == '__main__':
    # Manual smoke test for extract_moneySource.
    # To replay the CSV evaluation instead, call re_rule() here.
    sample_text = (
        "a建设资金来源及性质：资本金40%，自筹60%，,xx.=建设资金来源自筹，项目出资比例为100%，as，建设资金来自呜呜呜。"
        "1、采购内容及资金来源：采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
    )
    # Alternative probe text: "，资金来源是否都是要具体到每条来源明细，"
    # Expected (begin, end) offsets of the first two hits: 11,23 and 35,37
    print(extract_moneySource(sample_text))