import sys
import os
sys.path.append(os.path.abspath("../.."))
import re
import time
import pandas as pd
# BeautifulSoup/time are used below; imported explicitly rather than relying
# on the star imports to re-export them.
from bs4 import BeautifulSoup
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.interface.predictor import *
from BiddingKG.dl.foolnltk import selffool
from BiddingKG.dl.interface.Preprocessing import *
def get_data1():
    # Merge the three raw complaint/punishment tables, clean the HTML body,
    # split it into sentences and persist the result.
    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
    load = pd.concat([load1, load2, load3], axis=0)
    load = load.reset_index(drop=True)
    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
    sentences_list = get_sentences1(load['PAGE_CONTENT'])
    # Sentences are joined with the marker '*#*>' so they fit in one CSV cell.
    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
def get_ners():
    # Run NER over every article's sentences and collect one dict per entity,
    # recording both sentence-level and article-level (word offset) positions.
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # data = data.head(3)
    nersList = []
    for index, _sentences in zip(data.index, data['sentences']):
        _sentences = _sentences.split('*#*>')
        _ners = getNers(_sentences, useselffool=True)
        word_index = 0
        for ners, sentence in zip(_ners, _sentences):
            if len(ners) != 0:
                word_ner_list = ['O'] * len(sentence)
                for ner in ners:
                    nerDict = dict()
                    entity_type = ner[2]
                    nerDict['entity_type'] = entity_type
                    entity_text = ner[3]
                    nerDict['entity_text'] = entity_text
                    begin_index = ner[0]
                    nerDict['begin_index'] = begin_index
                    end_index = ner[1] - 1
                    nerDict['end_index'] = end_index
                    # Offsets relative to the whole article, not just the sentence.
                    wordOffset_begin = word_index + begin_index
                    nerDict['wordOffset_begin'] = wordOffset_begin
                    wordOffset_end = wordOffset_begin + len(entity_text)
                    nerDict['wordOffset_end'] = wordOffset_end
                    nerDict['sentence'] = sentence
                    nerDict['article_index'] = index
                    nersList.append(nerDict)
                    # BIO tagging of the sentence (currently unused downstream).
                    word_ner_list[begin_index] = 'B'
                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
            word_index += len(sentence)
    # save(nersList,"nersList.pk")
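# For reference, one nersList record produced above has this shape
# (field names from the code; the values here are made up):
# {'entity_type': 'company', 'entity_text': '某某公司',
#  'begin_index': 5, 'end_index': 9,
#  'wordOffset_begin': 25, 'wordOffset_end': 29,
#  'sentence': ',被处罚人:某某公司,...', 'article_index': 0}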
# Merge adjacent (org/company) and (person) entities.
def get_unionNers():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    ners = load("nersList.pk")
    org_companys = [[] for _ in range(len(data))]
    type1 = ['org', 'company', 'union_oc']
    persons = [[] for _ in range(len(data))]
    type2 = ['person', 'union_person']
    for ner in ners:
        if ner['entity_type'] in type1:
            org_companys[ner['article_index']].append(ner)
        if ner['entity_type'] in type2:
            persons[ner['article_index']].append(ner)

    def merge_adjacent(grouped):
        # Merge entity i into entity i+1 when both sit in the same sentence and
        # are either separated by a single '、' or directly adjacent.
        merged = []
        for entities in grouped:
            if entities and len(entities) > 1:
                union_nums = 0
                for i in range(len(entities) - 1):
                    cur, nxt = entities[i], entities[i + 1]
                    if cur['sentence'] != nxt['sentence']:
                        continue
                    if (cur['end_index'] == nxt['begin_index'] - 1 and cur['sentence'][cur['end_index']] == '、') \
                            or cur['end_index'] == nxt['begin_index']:
                        nxt['begin_index'] = cur['begin_index']
                        nxt['wordOffset_begin'] = cur['wordOffset_begin']
                        nxt['entity_text'] = cur['entity_text'] + '+' + nxt['entity_text']
                        entities[i] = 0      # mark the absorbed entity for removal
                        union_nums += 1
                for _ in range(union_nums):
                    entities.remove(0)
            merged.append(entities)
        return merged

    # Merge org/company chains, then person chains.
    new_org_companys = merge_adjacent(org_companys)
    new_persons = merge_adjacent(persons)
    # save([new_org_companys,new_persons],"unionNers.pk")
def test02():
    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]")
    # Optional title/text pre-filter, kept from an earlier experiment:
    # need_index = []
    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
    #     a = 0
    #     if text_rule.search(text):
    #         a = 1
    #     if title_rule.search(title):
    #         a = 0
    #     if text_rule.search(title):
    #         a = 1
    #     if a:
    #         need_index.append(index)
    # print(len(need_index))
    # load = load.loc[need_index]
    # print(len(load))
    # load = load.reset_index(drop=True)
    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")  # unused here
    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
    complainants_list = []
    a = 1
    load = load[9744:9745]  # debug: inspect a single article
    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
        print(a)
        a += 1
        getSentences = sentences.split('*#*>')
        ners = getNers(getSentences, useselffool=True)
        print(ners)
        print('======================')
        word_index = 0
        ners_list = []
        size = 16
        complainants = []
        for ner, sentence in zip(ners, getSentences):
            if len(ner) != 0:
                for aner in ner:
                    entity_type = aner[2]
                    entity_text = aner[3]
                    # begin = word_index + aner[0]
                    # end = begin + len(entity_text)
                    # Complainant: an org/company/person whose left context matches the rule.
                    if entity_type in ['org', 'company', 'person']:
                        left = sentence[max(0, aner[0] - size):aner[0]]
                        print(entity_text, left, sentence)
                        if complaint_rule.search(left):
                            print('yes')
                            entity_type = 'complainant'
                            complainants.append(entity_text)
                        # ners_list.append([begin, end, entity_type, entity_text])
            word_index += len(sentence)
        # One entry per article, so the column assignment below lines up.
        complainants_list.append(complainants)
    # test
    # for i in ners_list:
    #     print(i[3])
    #     print(processed[0][i[0]:i[1]])
    load['complainant'] = complainants_list
    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
# Complainant, complained-against party, punished party.
def get_complainant():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
    # ners = load("nersList.pk")
    unionNers = load("unionNers.pk")
    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
    complainants = [[] for _ in range(len(data))]
    punishPeople = [[] for _ in range(len(data))]
    size = 16
    # Complainant / challenger patterns (matched against the left context).
    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # Punished party / respondent patterns.
    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    time1 = time.time()
    for _ner in ners:
        if _ner:
            for ner in _ner:
                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
                if complainants_rule1.search(left):
                    complainants[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule1.search(left):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
                    # ",<entity>:" reads as a list header; the article class
                    # decides which side the entity belongs to.
                    if data['类别'][ner['article_index']] == '投诉处理':
                        complainants[ner['article_index']].append(ner['entity_text'])
                    else:
                        punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
    data['complainant'] = complainants
    data['punishPeople'] = punishPeople
    print(time.time() - time1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
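# A minimal standalone check (made-up sentence and offsets, not part of the
# pipeline) of how the left-context window drives complainants_rule1 above.
def _demo_left_context():
    sentence = ",投诉人:福建光正工程项目管理有限公司,"
    begin_index = 5                      # hypothetical NER start of the company name
    size = 16
    left = sentence[max(0, begin_index - size):begin_index]   # -> ",投诉人:"
    rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    print(bool(rule.search(left)))       # True -> the entity would be tagged as complainant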
def get_complainant2(list_sentences, list_entitys, text_type):
    '''
    :param list_sentences: list_sentences from get_preprocessed()
    :param list_entitys: list_entitys from get_preprocessed()
    :param text_type: article category (punishment type)
    :return:
        complainants: list of complainants
        punishPeople: list of complained-against / punished parties
    '''
    sentences_list = list_sentences
    entitys_list = list_entitys
    size = 16
    base_types = ['org', 'company', 'person']
    org_types = ['org', 'company', 'union_org_company']
    person_types = ['person', 'union_person']
    need_entitys = []
    for entity in entitys_list:
        if entity.entity_type in base_types:
            need_entitys.append(entity)
    # Merge adjacent same-kind entities joined by '', '、', '和' or '及'.
    drop_count = 0
    for i in range(1, len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        sentence = sentences_list[entity.sentence_index].sentence_text
        last_entity = need_entitys[i - 1]
        if entity.sentence_index == last_entity.sentence_index:
            if (entity.entity_type in org_types and last_entity.entity_type in org_types) or \
                    (entity.entity_type in person_types and last_entity.entity_type in person_types):
                if entity_begin - last_entity.wordOffset_end < 2 and \
                        sentence[last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
                    need_entitys[i].begin_index = last_entity.begin_index
                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
                    if entity.entity_type in org_types:
                        need_entitys[i].entity_type = 'union_org_company'
                    else:
                        need_entitys[i].entity_type = 'union_person'
                    need_entitys[i - 1] = 0
                    drop_count += 1
    for _ in range(drop_count):
        need_entitys.remove(0)
    # Complainant / challenger patterns.
    complainants_rule1 = re.compile(
        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # Punished party / respondent patterns.
    punishPeople_rule1 = re.compile(
        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    complainants = []
    punishPeople = []
    for i in range(len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        # The sentence containing the entity.
        sentence = sentences_list[entity.sentence_index].sentence_text
        left = sentence[max(0, entity_begin - size):entity_begin]
        right = sentence[entity_end:min(entity_end + size, len(sentence))]
        if complainants_rule1.search(left):
            complainants.append(entity)
        elif punishPeople_rule1.search(left):
            punishPeople.append(entity)
        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
            if text_type == '投诉处理':
                complainants.append(entity)
            else:
                punishPeople.append(entity)
        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
            punishPeople.append(entity)
    # Split merged (union) entities back into their member names.
    result_complainants = []
    result_punishPeople = []
    for entity in complainants:
        if entity.entity_type in ['union_org_company', 'union_person']:
            result_complainants.extend(entity.entity_text.split('+'))
        else:
            result_complainants.append(entity.entity_text)
    for entity in punishPeople:
        if entity.entity_type in ['union_org_company', 'union_person']:
            result_punishPeople.extend(entity.entity_text.split('+'))
        else:
            result_punishPeople.append(entity.entity_text)
    return list(set(result_complainants)), list(set(result_punishPeople))
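# Tiny standalone check (made-up offsets) of the adjacency condition used in
# the merge step above: two same-kind entities merge when the gap between them
# is at most one character and that gap is '', '、', '和' or '及'.
def _demo_union_condition():
    sentence = "被投诉人:甲公司、乙公司,"
    last_end, begin = 8, 9               # hypothetical wordOffsets around the '、'
    gap = sentence[last_end:begin]
    print(begin - last_end < 2 and gap in ['', '、', '和', '及'])   # True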
# Announcement classification.
def textClassify():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # Complaint cues: 投诉人|检举人|举报人|质疑人|质疑函
    pattern1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
    re1 = re.compile(pattern1)
    pattern2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
    re2 = re.compile(pattern2)
    pattern3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
    re3 = re.compile(pattern3)
    pattern4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
    re4 = re.compile(pattern4)
    pattern5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
    re5 = re.compile(pattern5)
    pattern6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
    re6 = re.compile(pattern6)
    pattern7 = '处理决定'
    re7 = re.compile(pattern7)
    pattern8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
    re8 = re.compile(pattern8)
    pos = []
    _type = []
    for title, text in zip(data['PAGE_TITLE'], data["PAGE_CONTENT"]):
        p = []
        t = ''
        # Complaint rules first...
        if re1.search(text) or re1.search(title):
            p.append(pattern1)
            t = '投诉'
        elif re2.search(text) and re.search('投诉', text):
            p.append('投诉+' + pattern2)
            t = '投诉'
        elif re.search("回复", title):
            p.append("回复")
            t = '投诉'
        # ...then punishment rules...
        if len(p) == 0:
            if re3.search(title) or re3.search(text):
                p.append(pattern3)
                t = '处罚'
            elif re4.search(title):
                p.append(pattern4)
                t = '处罚'
            elif re5.search(title) or re5.search(text):
                p.append(pattern5)
                t = '处罚'
            elif re6.search(text) or re6.search(title):
                p.append(pattern6)
                t = '处罚'
            elif re8.search(text):
                p.append(pattern8)
                t = '处罚'
        # ...finally the ambiguous '处理决定' cue.
        if len(p) == 0:
            if re7.search(text) and re.search('投诉', text):
                p.append('投诉+' + pattern7)
                t = '投诉'
            elif re7.search(text) or re7.search(title):
                p.append("处罚+" + pattern7)
                t = '处罚'
        pos.append(p)
        _type.append(t)
    data['pos'] = pos
    data['type'] = _type
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
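# Quick sanity check of the rule cascade above on two made-up records:
# the first title trips re3 (punishment), the second text trips re1 (complaint).
def _demo_textClassify_rules():
    re1 = re.compile("投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位")
    re3 = re.compile("关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门")
    print(bool(re3.search("关于对某公司的处罚决定")))    # True -> 处罚
    print(bool(re1.search(",投诉人:某某公司,")))        # True -> 投诉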
# Whether a complaint was upheld, plus the decision text.
def get_punishWhether01():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '投诉']
    # Upheld / rejected cues.
    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
    punishWhether = []
    punishDecision = []
    # Decision-extraction patterns, tried from most to least specific.
    # Note: "[裁决|处理]" in the original was a character class and could never
    # match the intended words; rewritten as the alternation "(?:裁决|处理)".
    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|(?:裁决|处理)依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|(?:裁决|处理)依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")

    def findDecision(text):
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.findall(text)[-1]
        elif punishDecision_5.search(text):
            decision = punishDecision_5.search(text).group(1)
        elif punishDecision_6.search(text):
            decision = punishDecision_6.findall(text)
            decision1 = decision[-1]
            # Skip a trailing litigation-notice ("诉讼") clause when possible.
            if re.search("诉讼", decision1) and len(decision) > 1:
                decision1 = decision[-2]
            decision = decision1
        return decision

    for text in data['PAGE_CONTENT']:
        pw = ''
        if punishWhether_1.search(text):
            pw = 1
        elif punishWhether_0.search(text):
            pw = 0
        punishWhether.append(pw)
        # Decisions usually sit near the end, so search the second half first.
        mid = len(text) // 2
        decision = findDecision(text[mid:])
        if decision == '':
            decision = findDecision(text)
        punishDecision.append(decision)
    data['punishWhether'] = punishWhether
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
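# Sanity check of the upheld/rejected cues above on made-up snippets,
# using abridged versions of the two patterns:
def _demo_punishWhether():
    upheld = re.compile("投诉[^。,,不]+?成立|投诉成立|情况属实|予以支持")
    rejected = re.compile("投诉[^。,,]*不能?成立|驳回[^。,,]*投诉|不予支持")
    print(bool(upheld.search("经查,投诉事项成立。")))        # True
    print(bool(rejected.search("投诉事项不成立,予以驳回。")))  # True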
# Decision text for punishment announcements.
def get_punishDecision():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '处罚']
    # Patterns tried from most to least specific.
    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
    punishDecision = []
    for text in data['PAGE_CONTENT']:
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.search(text).group(1)
        elif punishDecision_5.search(text):
            decision = punishDecision_5.findall(text)[-1]
        elif punishDecision_6.search(text):
            decision = punishDecision_6.search(text).group(1)
        elif punishDecision_7.search(text):
            decision = punishDecision_7.findall(text)[-1]
        punishDecision.append(decision)
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
# Enforcement agency and punishment date.
def get_institution():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
    ners = load("nersList.pk")
    orgs = [[] for _ in range(len(data))]
    times = [[] for _ in range(len(data))]
    institutions = [[] for _ in range(len(data))]
    punishTimes = [[] for _ in range(len(data))]
    # Explicit field labels for agency / date.
    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
    for ner in ners:
        if ner['entity_type'] == 'org':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if institution_1.search(left):
                institutions[ner['article_index']].append(ner['entity_text'])
            orgs[ner['article_index']].append(ner)
        elif ner['entity_type'] == 'time':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if punishTimes_1.search(left):
                punishTimes[ner['article_index']].append(ner['entity_text'])
            times[ner['article_index']].append(ner)
    # Keep only the last few candidates per article (sign-offs sit at the end).
    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
    times = [t[-3:] if len(t) > 3 else t for t in times]
    data['org'] = orgs
    data['time'] = times
    data['institution'] = institutions
    data['punishTime'] = punishTimes
    # data = data[data['type'].isin(["投诉","处罚"])]
    print(len(data))
    institution_list = []
    punishTime_list = []
    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    for title, text, org, n_time, institution, punishTime in zip(
            data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'],
            data['time'], data['institution'], data['punishTime']):
        ins = ''
        ptime = ''
        if punishTime:
            ptime = punishTime
        if institution:
            ins = institution
        else:
            # Fall back to an org in the title that looks like an agency name.
            title_ners = getNers([title], useselffool=True)
            if title_ners[0]:
                for title_ner in title_ners[0]:
                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
                        ins = title_ner[3]
                        break
            # Otherwise look for a sign-off: an org immediately followed by a date.
            for _org in org[::-1]:
                right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
                if institution_time.search(right):
                    if ins == '':
                        ins = _org['entity_text']
                    if ptime == '':
                        ptime = institution_time.search(right).group(1)
                    break
        if ptime == '' and len(n_time) != 0:
            # A long time entity at the very end of the text is likely the sign-off date.
            textLong = len(text)
            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
                ptime = n_time[-1]['entity_text']
        institution_list.append(ins)
        punishTime_list.append(ptime)
    data['institution'] = institution_list
    data['punishTime'] = punishTime_list
    data = data.drop(columns=['org', 'time'], axis=1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
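# Sanity check of the sign-off date pattern used above (made-up right context):
def _demo_institution_time():
    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    right = ",2019年5月20日"
    m = institution_time.search(right)
    print(m.group(1) if m else None)   # -> ",2019年5月20日"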
# Punishment type (announcement category).
def get_punishType():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # Tentative classes: 严重违法失信, 行政处罚, 投诉处理, 监督检查, 其他失信记录.
    # Unrelated announcements.
    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
    othertype = "其他无关公告"
    # Complaint handling.
    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
    re1_2 = re.compile("处理决定|回复")
    re1_type = '投诉处理'
    # Supervision and inspection.
    re2 = re.compile("监督检查|监管调查|监督处理")
    re2_type = "监督检查"
    # Administrative punishment.
    re3 = re.compile("行政处罚|行政处理")
    re3_type = "行政处罚"
    # Serious dishonesty / lawbreaking.
    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
    re4_type = "严重违法失信"
    # Other dishonesty records.
    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
                          "违[规法]行为|违规事项|考评依据|失信行为")
    re_otherType = "其他失信公告"
    punishType_list = []
    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
        punishType = ''
        titleWithText = title + text
        if title_rule.search(title):
            punishType = othertype
        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
            punishType = re1_type
        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
            punishType = re1_type
        elif re2.search(titleWithText):
            punishType = re2_type
        elif re3.search(titleWithText):
            punishType = re3_type
        elif re4.search(titleWithText):
            punishType = re4_type
        elif re_other.search(titleWithText) or re.search("处罚", title):
            punishType = re_otherType
        punishType_list.append(punishType)
    data['punishType'] = punishType_list
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')
def getNers_my(sentences, MAXAREA=10000, useselffool=False):
    '''
    :param sentences: list of sentences to tag
    :param MAXAREA: rough batch budget (longest sentence length x batch size)
    :return: NER results after rate-limited batched execution,
             one entity list per input sentence, in the original order
    '''
    def getData(ners, process_data):
        process_sentences = [item[1] for item in process_data]
        print(process_data)
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        print('ner_ :', ner_)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    # Sort longest-first so each batch groups sentences of similar length.
    sents = []
    for i in range(len(sentences)):
        sents.append([i, sentences[i]])
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    print(sents)
    index_ = 0
    ners = [[] for _ in range(len(sentences))]
    while True:
        # Size the batch so (longest sentence in batch) x (batch size) ~ MAXAREA.
        width = len(sents[index_][1])
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
        if index_ >= len(sents):
            break
    return ners
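# The batch-sizing arithmetic above, in isolation: with MAXAREA=10000 and the
# longest remaining sentence at 250 characters, one batch covers
# 10000 // 250 + 1 = 41 sentences.
def _demo_batch_height():
    MAXAREA, width = 10000, 250
    print(MAXAREA // width + 1)   # 41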
# Web announcement cleaning.
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: raw article source HTML to process
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # Table handling: flatten HTML tables to text, then segment.
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles
# Sentence splitting.
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article texts
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # NLP handling: split on '。' and drop duplicate sentences.
        if article_processed is not None and len(article_processed) != 0:
            split_pattern = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_pattern, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            # Rate-limited execution (tokenisation disabled here).
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences, useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
            if len(list_sentences_temp) == 0:
                # Fall back to the whole article when no sentence was produced.
                list_sentences_temp.append(article_processed)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences
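# The splitting loop above, run standalone on a tiny made-up string: duplicates
# are kept out by the seen-set, and a trailing fragment without '。' is preserved.
def _demo_split():
    text = "第一句。第二句。第一句。结尾"
    begin, seen, out = 0, set(), []
    for m in re.finditer("。", text):
        sen = text[begin:m.span()[1]]
        if sen and sen not in seen:
            out.append(sen)
            seen.add(sen)
        begin = m.span()[1]
    tail = text[begin:]
    if tail and tail not in seen:
        out.append(tail)
    return out   # ['第一句。', '第二句。', '结尾']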
# Standalone experiment: merging adjacent NER spans (融合) on a hand-made example.
def ronghe():
    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
    s = ['person', 'org', 'company', 'union']
    remove_num = 0
    # range(len(ners) - 1): each step looks ahead one entity, so the last
    # element must not start a comparison.
    for i in range(len(ners) - 1):
        ner = ners[i]
        begin = ner[0]
        end = ner[1]
        entity_type = ner[2]
        if entity_type in s:
            if end == ners[i + 1][0] and a[end - 1] == '、':
                # Case 1: joined by '、'.
                new_ner = (begin, ners[i + 1][1], 'union', ner[3] + '、' + ners[i + 1][3])
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
                continue
            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
                # Case 2: comma-separated list items.
                new_ner = (begin, ners[i + 1][1], 'union', ner[3] + ',' + ners[i + 1][3])
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
    for i in range(remove_num):
        ners.remove(0)
    print(ners)
if __name__ == '__main__':
    # get_data1()
    # get_ners()
    # test02()
    # get_unionNers()
    # Complainant / complained-against / punished party
    # get_complainant()
    # ronghe()
    # Classification
    # textClassify()
    # Complaint upheld + decision text (complaints)
    # get_punishWhether01()
    # Decision text (punishments)
    # get_punishDecision()
    # Enforcement agency and punishment date
    get_institution()
    # Punishment type
    # get_punishType()
    pass