import sys
import os
sys.path.append(os.path.abspath("../.."))
import time  # time.time() is used throughout; make the dependency explicit
import re
import pandas as pd
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.interface.predictor import *
from BiddingKG.dl.foolnltk import selffool
from BiddingKG.dl.interface.Preprocessing import *
def get_data1():
    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
    load = pd.concat([load1, load2, load3], axis=0)
    load = load.reset_index(drop=True)
    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
    sentences_list = get_sentences1(load['PAGE_CONTENT'])
    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
def get_ners():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # data = data.head(3)
    nersList = []
    for index, _sentences in zip(data.index, data['sentences']):
        _sentences = _sentences.split('*#*>')
        _ners = getNers(_sentences, useselffool=True)
        word_index = 0
        for ners, sentence in zip(_ners, _sentences):
            if len(ners) != 0:
                word_ner_list = ['O'] * len(sentence)
                for ner in ners:
                    nerDict = dict()
                    entity_type = ner[2]
                    nerDict['entity_type'] = entity_type
                    entity_text = ner[3]
                    nerDict['entity_text'] = entity_text
                    begin_index = ner[0]
                    nerDict['begin_index'] = begin_index
                    end_index = ner[1] - 1
                    nerDict['end_index'] = end_index
                    wordOffset_begin = word_index + begin_index
                    nerDict['wordOffset_begin'] = wordOffset_begin
                    wordOffset_end = wordOffset_begin + len(entity_text)
                    nerDict['wordOffset_end'] = wordOffset_end
                    nerDict['sentence'] = sentence
                    nerDict['article_index'] = index
                    # print('====')
                    # print(begin_index, end_index, entity_type, entity_text)
                    nersList.append(nerDict)
                    # print(nerDict)
                    word_ner_list[begin_index] = 'B'
                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
            word_index += len(sentence)
    # save(nersList, "nersList.pk")
# Merge adjacent (org/company) and (person) entities
def get_unionNers():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    ners = load("nersList.pk")
    org_companys = [[] for _ in range(len(data))]
    type1 = ['org', 'company', 'union_oc']
    persons = [[] for _ in range(len(data))]
    type2 = ['person', 'union_person']
    for ner in ners:
        if ner['entity_type'] in type1:
            org_companys[ner['article_index']].append(ner)
        if ner['entity_type'] in type2:
            persons[ner['article_index']].append(ner)
    # merge org and company entities
    new_org_companys = []
    for org_company in org_companys:
        if org_company and len(org_company) > 1:
            union_nums = 0
            for i in range(len(org_company) - 1):
                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 \
                        and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    # print(1)
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    # print(org_company[i + 1]['entity_text'])
                    org_company[i] = 0
                    union_nums += 1
                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    # print(2)
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    # print(org_company[i + 1]['entity_text'])
                    org_company[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                org_company.remove(0)
        new_org_companys.append(org_company)
    # merge person entities
    new_persons = []
    for person in persons:
        if person and len(person) > 1:
            union_nums = 0
            for i in range(len(person) - 1):
                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 \
                        and person[i]['sentence'][person[i]['end_index']] == '、' \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    # print(1)
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    # print(person[i + 1]['entity_text'])
                    person[i] = 0
                    union_nums += 1
                elif person[i]['end_index'] == person[i + 1]['begin_index'] \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    # print(2)
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    # print(person[i + 1]['entity_text'])
                    person[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                person.remove(0)
        new_persons.append(person)
    # save([new_org_companys, new_persons], "unionNers.pk")
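
# Illustrative sketch (not part of the original pipeline): the merge rule above
# combines two entities when they sit in the same sentence and are either
# separated by a single '、' or directly adjacent. Minimal standalone example
# with hypothetical dicts shaped like the entries stored in nersList.pk:
def _demo_union_merge():
    sentence = "甲公司、乙公司"
    a = {'entity_text': '甲公司', 'begin_index': 0, 'end_index': 3, 'sentence': sentence}
    b = {'entity_text': '乙公司', 'begin_index': 4, 'end_index': 7, 'sentence': sentence}
    # same condition as the '、'-separated branch above
    if a['end_index'] == b['begin_index'] - 1 and sentence[a['end_index']] == '、':
        b['begin_index'] = a['begin_index']
        b['entity_text'] = a['entity_text'] + '+' + b['entity_text']
    print(b['entity_text'])  # 甲公司+乙公司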
def test02():
    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]")
    # need_index = []
    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
    #     a = 0
    #     if text_rule.search(text):
    #         a = 1
    #     if title_rule.search(title):
    #         a = 0
    #     if text_rule.search(title):
    #         a = 1
    #     if a:
    #         need_index.append(index)
    # print(len(need_index))
    # load = load.loc[need_index]
    # print(len(load))
    # load = load.reset_index(drop=True)
    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
    complainants_list = []
    a = 1
    load = load[9744:9745]
    size = 16
    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
        print(a)
        a += 1
        getSentences = sentences.split('*#*>')
        # print(getSentences)
        ners = getNers(getSentences, useselffool=True)
        print(ners)
        print('======================')
        word_index = 0
        ners_list = []
        complainants = []  # collected per article so complainants_list aligns with load's rows
        for ner, sentence in zip(ners, getSentences):
            if len(ner) != 0:
                for aner in ner:
                    entity_type = aner[2]
                    entity_text = aner[3]
                    # begin = word_index + aner[0]
                    # end = begin + len(entity_text)
                    # complainant
                    if entity_type in ['org', 'company', 'person']:
                        left = sentence[max(0, aner[0] - size):aner[0]]
                        print(entity_text, left, sentence)
                        if complaint_rule.search(left):
                            print('yes')
                            entity_type = 'complainant'
                            complainants.append(entity_text)
                        # ners_list.append([begin, end, entity_type, entity_text])
            word_index += len(sentence)
        complainants_list.append(complainants)
    # test
    # for i in ners_list:
    #     print(i[3])
    #     print(processed[0][i[0]:i[1]])
    load['complainant'] = complainants_list
    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
# Complainant, complained-against party, punished party
def get_complainant():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
    # ners = load("nersList.pk")
    unionNers = load("unionNers.pk")
    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
    complainants = [[] for _ in range(len(data))]
    punishPeople = [[] for _ in range(len(data))]
    a = ['org', 'company', 'person']
    size = 16
    # complainant / challenger
    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished party / complained-against party
    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    time1 = time.time()
    for _ner in ners:
        if _ner:
            for ner in _ner:
                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
                # print(left)
                if complainants_rule1.search(left):
                    complainants[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule1.search(left):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
                    if data['类别'][ner['article_index']] == '投诉处理':
                        complainants[ner['article_index']].append(ner['entity_text'])
                    else:
                        punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
    data['complainant'] = complainants
    data['punishPeople'] = punishPeople
    print(time.time() - time1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
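
# Toy check of the left-context rule used above: a window of `size` characters
# before the entity is matched against the complainant pattern. The pattern here
# is a simplified subpattern of complainants_rule1, for illustration only:
def _demo_left_context():
    sentence = "投诉人:某某建设有限公司,联系地址:某市某区"
    begin = 4  # assumed begin_index of the company entity
    left = sentence[max(0, begin - 16):begin]
    print(bool(re.search("投[诉拆][人方][::]+", left)))  # True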
def get_complainant2(list_sentences, list_entitys, text_type):
    '''
    :param list_sentences: list_sentences from get_preprocessed()
    :param list_entitys: list_entitys from get_preprocessed()
    :param text_type: article category (penalty type)
    :return:
        complainants: list of complainants
        punishPeople: complained-against / punished parties
    '''
    sentences_list = list_sentences
    entitys_list = list_entitys
    size = 16
    a = ['org', 'company', 'person']
    b = ['org', 'company', 'union_org_company']
    c = ['person', 'union_person']
    need_entitys = []
    for entity in entitys_list:
        if entity.entity_type in a:
            need_entitys.append(entity)
    # entity merging
    drop_count = 0
    for i in range(1, len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        sentence = sentences_list[entity.sentence_index].sentence_text
        last_entity = need_entitys[i - 1]
        if entity.sentence_index == last_entity.sentence_index:
            if (entity.entity_type in b and last_entity.entity_type in b) or \
                    (entity.entity_type in c and last_entity.entity_type in c):
                if entity_begin - last_entity.wordOffset_end < 2 and \
                        sentence[last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
                    need_entitys[i].begin_index = last_entity.begin_index
                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
                    if entity.entity_type in b:
                        need_entitys[i].entity_type = 'union_org_company'
                    else:
                        need_entitys[i].entity_type = 'union_person'
                    need_entitys[i - 1] = 0
                    drop_count += 1
    for _ in range(drop_count):
        need_entitys.remove(0)
    # complainant / challenger
    complainants_rule1 = re.compile(
        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished party / complained-against party
    punishPeople_rule1 = re.compile(
        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    complainants = []
    punishPeople = []
    for i in range(len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        # sentence containing the entity
        sentence = sentences_list[entity.sentence_index].sentence_text
        left = sentence[max(0, entity_begin - size):entity_begin]
        right = sentence[entity_end:min(entity_end + size, len(sentence))]
        if complainants_rule1.search(left):
            complainants.append(entity)
        elif punishPeople_rule1.search(left):
            punishPeople.append(entity)
        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
            if text_type == '投诉处理':
                complainants.append(entity)
            else:
                punishPeople.append(entity)
        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
            punishPeople.append(entity)
    result_complainants = []
    result_punishPeople = []
    for entity in complainants:
        if entity.entity_type in ['union_org_company', 'union_person']:
            for item in entity.entity_text.split('+'):
                result_complainants.append(item)
        else:
            result_complainants.append(entity.entity_text)
    for entity in punishPeople:
        if entity.entity_type in ['union_org_company', 'union_person']:
            for item in entity.entity_text.split('+'):
                result_punishPeople.append(item)
        else:
            result_punishPeople.append(entity.entity_text)
    return list(set(result_complainants)), list(set(result_punishPeople))
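
# Hedged usage sketch for get_complainant2. The docstring says its inputs come
# from BiddingKG's get_preprocessed(); the exact call and unpacking below are
# assumptions, not confirmed by this file:
#
#   list_articles, list_sentences, list_entitys = get_preprocessed(...)  # hypothetical unpacking
#   complainants, punishPeople = get_complainant2(
#       list_sentences[0], list_entitys[0], text_type='投诉处理')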
# Announcement classification
def textClassify():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # complainant | informant | whistle-blower | challenger | challenge letter
    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
    re1 = re.compile(patten1)
    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
    re2 = re.compile(patten2)
    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
    re3 = re.compile(patten3)
    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
    re4 = re.compile(patten4)
    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
    re5 = re.compile(patten5)
    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
    re6 = re.compile(patten6)
    patten7 = '处理决定'
    re7 = re.compile(patten7)
    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
    re8 = re.compile(patten8)
    pos = []
    _type = []
    for title, text in zip(data['PAGE_TITLE'], data["PAGE_CONTENT"]):
        p = []
        t = ''
        if re1.search(text) or re1.search(title):
            p.append(patten1)
            t = '投诉'
        elif re2.search(text) and re.search('投诉', text):
            p.append('投诉+' + patten2)
            t = '投诉'
        elif re.search("回复", title):
            p.append("回复")
            t = '投诉'
        if len(p) == 0:
            if re3.search(title) or re3.search(text):
                p.append(patten3)
                t = '处罚'
            elif re4.search(title):
                p.append(patten4)
                t = '处罚'
            elif re5.search(title) or re5.search(text):
                p.append(patten5)
                t = '处罚'
            elif re6.search(text) or re6.search(title):
                p.append(patten6)
                t = '处罚'
            elif re8.search(text):
                p.append(patten8)
                t = '处罚'
        if len(p) == 0:
            if re7.search(text) and re.search('投诉', text):
                p.append('投诉+' + patten7)
                t = '投诉'
            elif re7.search(text) or re7.search(title):
                p.append("处罚+" + patten7)
                t = '处罚'
        pos.append(p)
        _type.append(t)
    data['pos'] = pos
    data['type'] = _type
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
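
# Minimal sketch of the rule priority in textClassify(): complaint patterns
# (re1/re2/"回复") are tried first, and penalty patterns fire only when no
# complaint rule matched. Toy input with simplified subpatterns:
def _demo_textClassify_priority():
    text = "投诉人:某公司。经监督检查,投诉成立。"
    if re.search("投诉人|检举人|举报人|质疑人", text):
        print('投诉')  # complaint wins even though 监督检查 also matches
    elif re.search("行政处罚|监督检查", text):
        print('处罚')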
# Whether the complaint is upheld
def get_punishWhether01():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '投诉']
    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
    punishWhether = []
    punishDecision = []
    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|(?:裁决|处理)依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|(?:裁决|处理)依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")

    def findDecision(text):
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.findall(text)
            decision = decision[-1]
        elif punishDecision_5.search(text):
            decision = punishDecision_5.search(text).group(1)
        elif punishDecision_6.search(text):
            decision = punishDecision_6.findall(text)
            decision1 = decision[-1]
            if re.search("诉讼", decision1) and len(decision) > 1:
                decision1 = decision[-2]
            decision = decision1
        return decision

    for text in data['PAGE_CONTENT']:
        pw = ''
        if punishWhether_1.search(text):
            pw = 1
        elif punishWhether_0.search(text):
            pw = 0
        punishWhether.append(pw)
        # prefer a decision found in the second half of the article
        mid = len(text) // 2
        lower_half = text[mid:]
        decision = findDecision(lower_half)
        if decision == '':
            decision = findDecision(text)
        punishDecision.append(decision)
    data['punishWhether'] = punishWhether
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
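
# Small self-check for the punishWhether patterns above, using simplified
# subpatterns and made-up inputs (illustrative only):
def _demo_punishWhether():
    pw_1 = re.compile("投诉[^。,,不]+?成立|投诉成立|情况属实|予以支持")
    pw_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|不予支持|予以驳回")
    print(bool(pw_1.search("经查,投诉事项成立。")))      # True -> upheld
    print(bool(pw_0.search("投诉事项不成立,予以驳回。")))  # True -> rejected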
# Penalty decision
def get_punishDecision():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '处罚']
    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
    punishDecision = []
    for text in data['PAGE_CONTENT']:
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.search(text).group(1)
        elif punishDecision_5.search(text):
            decision = punishDecision_5.findall(text)
            decision = decision[-1]
        elif punishDecision_6.search(text):
            decision = punishDecision_6.search(text).group(1)
        elif punishDecision_7.search(text):
            decision = punishDecision_7.findall(text)
            decision = decision[-1]
        punishDecision.append(decision)
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
# Enforcement institution and penalty time
def get_institution():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
    ners = load("nersList.pk")
    orgs = [[] for _ in range(len(data))]
    times = [[] for _ in range(len(data))]
    institutions = [[] for _ in range(len(data))]
    punishTimes = [[] for _ in range(len(data))]
    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
    for ner in ners:
        if ner['entity_type'] == 'org':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if institution_1.search(left):
                institutions[ner['article_index']].append(ner['entity_text'])
            orgs[ner['article_index']].append(ner)
        elif ner['entity_type'] == 'time':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if punishTimes_1.search(left):
                punishTimes[ner['article_index']].append(ner['entity_text'])
            times[ner['article_index']].append(ner)
    # keep only the last few candidates per article
    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
    times = [_time[-3:] if len(_time) > 3 else _time for _time in times]
    data['org'] = orgs
    data['time'] = times
    data['institution'] = institutions
    data['punishTime'] = punishTimes
    # data = data[data['type'].isin(["投诉","处罚"])]
    print(len(data))
    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
    institution_list = []
    punishTime_list = []
    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    for title, text, org, n_time, institution, punishTime in zip(data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'], data['time'], data['institution'], data['punishTime']):
        ins = ''
        ptime = ''
        if punishTime:
            ptime = punishTime
        if institution:
            ins = institution
        else:
            title_ners = getNers([title], useselffool=True)
            if title_ners[0]:
                for title_ner in title_ners[0]:
                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
                        # 'title:'+
                        ins = title_ner[3]
                        # print(title_ner[3])
                        break
        # if ins == '':
        for _org in org[::-1]:
            right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
            if institution_time.search(right):
                if ins == '':
                    # "text_EndWithTime:" +
                    ins = _org['entity_text']
                if ptime == '':
                    # "text_EndWithIns:" +
                    ptime = institution_time.search(right).group(1)
                break
        if ptime == '' and len(n_time) != 0:
            textLong = len(text)
            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
                # "EndOfText:" +
                ptime = n_time[-1]['entity_text']
        institution_list.append(ins)
        punishTime_list.append(ptime)
    data['institution'] = institution_list
    data['punishTime'] = punishTime_list
    data = data.drop(columns=['org', 'time'], axis=1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
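
# Illustrative check of the right-context rule above: an org entity immediately
# followed by a date is taken as the issuing institution, and that date as the
# penalty time. Made-up right-context string:
def _demo_institution_time():
    rule = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    right = ",2019年5月8日"
    m = rule.search(right)
    print(m.group(1) if m else None)  # ,2019年5月8日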
# Penalty type
def get_punishType():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # tentative categories: 严重违法失信, 行政处罚, 投诉处理, 监督检查, 其他失信记录
    # unrelated announcements
    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
    othertype = "其他无关公告"
    # complaint handling
    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
    re1_2 = re.compile("处理决定|回复")
    re1_type = '投诉处理'
    # supervision and inspection
    re2 = re.compile("监督检查|监管调查|监督处理")
    re2_type = "监督检查"
    # administrative penalty
    re3 = re.compile("行政处罚|行政处理")
    re3_type = "行政处罚"
    # serious dishonesty / illegal conduct
    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
    re4_type = "严重违法失信"
    # other dishonesty announcements
    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
                          "违[规法]行为|违规事项|考评依据|失信行为")
    re_otherType = "其他失信公告"
    punishType_list = []
    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
        punishType = ''
        titleWithText = title + text
        if title_rule.search(title):
            punishType = othertype
        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
            punishType = re1_type
        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
            punishType = re1_type
        elif re2.search(titleWithText):
            punishType = re2_type
        elif re3.search(titleWithText):
            punishType = re3_type
        elif re4.search(titleWithText):
            punishType = re4_type
        elif re_other.search(titleWithText) or re.search("处罚", title):
            punishType = re_otherType
        punishType_list.append(punishType)
    data['punishType'] = punishType_list
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')
def getNers_my(sentences, MAXAREA=10000, useselffool=False):
    '''
    :param sentences: list of sentences
    :return: entity recognition results, produced batch by batch so that each
             selffool call processes at most roughly MAXAREA characters
    '''
    def getData(ners, process_data):
        process_sentences = [item[1] for item in process_data]
        print(process_data)
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        print('ner_ :', ner_)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    sents = []
    for i in range(len(sentences)):
        sents.append([i, sentences[i]])
    # process the longest sentences first
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    print(sents)
    index_ = 0
    ners = [[] for i in range(len(sentences))]
    while True:
        width = len(sents[index_][1])
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
        if index_ >= len(sents):
            break
    return ners
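
# Hedged usage sketch: getNers_my sorts sentences longest-first and batches them
# so each selffool call covers at most roughly MAXAREA characters; results are
# written back in the original sentence order. Illustrative call (assumes
# BiddingKG's selffool model is importable and loaded):
#
#   ners = getNers_my(["第一句。", "第二个句子。"], MAXAREA=5000, useselffool=True)
#   # ners[i] is the entity list for sentences[i]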
# Web announcement processing
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: article source html to process
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles
# Sentence splitting
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article text
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # NLP processing
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # rate-limited execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences, useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
        if len(list_sentences_temp) == 0:
            # fall back to the whole article when no sentence was produced
            list_sentences_temp.append(article_processed)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences
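
# Standalone sketch of the sentence-splitting rule used above: split on '。'
# and drop exact duplicate sentences (a set tracks what was already emitted).
def _demo_split_dedup():
    text = "第一句。第二句。第一句。结尾"
    seen, out, begin = set(), [], 0
    for m in re.finditer("。", text):
        sen = text[begin:m.span()[1]]
        if sen and sen not in seen:
            out.append(sen)
            seen.add(sen)
        begin = m.span()[1]
    tail = text[begin:]
    if tail and tail not in seen:
        out.append(tail)
    print(out)  # ['第一句。', '第二句。', '结尾']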
def ronghe():
    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
    s = ['person', 'org', 'company', 'union']
    remove_num = 0
    # stop at len(ners) - 1: each step looks ahead at ners[i + 1]
    for i in range(len(ners) - 1):
        print(0)
        ner = ners[i]
        begin = ner[0]
        end = ner[1]
        ner_type = ner[2]
        if ner_type in s:
            if end == ners[i + 1][0] and a[end - 1] == '、':
                print(1)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + '、' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
                continue
            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
                print(2)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + ',' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
    for i in range(remove_num):
        ners.remove(0)
    print(ners)
if __name__ == '__main__':
    # get_data1()
    # get_ners()
    # test02()
    # get_unionNers()
    # complainant / complained-against / punished party
    # get_complainant()
    # ronghe()
    # classification
    # textClassify()
    # whether the complaint is upheld, decision (complaints)
    # get_punishWhether01()
    # decision (penalties)
    # get_punishDecision()
    # enforcement institution, penalty time
    get_institution()
    # penalty type
    # get_punishType()
    pass