import sys
import os

sys.path.append(os.path.abspath("../.."))

import re
import time

import pandas as pd
# time and BeautifulSoup may also be pulled in by the wildcard imports below;
# made explicit here since this module uses them directly
from bs4 import BeautifulSoup

from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.interface.predictor import *
from BiddingKG.dl.foolnltk import selffool
from BiddingKG.dl.interface.Preprocessing import *


def get_data1():
    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
    load = pd.concat([load1, load2, load3], axis=0)
    load = load.reset_index(drop=True)
    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
    # join sentences with the '*#*>' separator so they survive the CSV round trip
    sentences_list = get_sentences1(load['PAGE_CONTENT'])
    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")


def get_ners():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # data = data.head(3)
    nersList = []
    for index, _sentences in zip(data.index, data['sentences']):
        _sentences = _sentences.split('*#*>')
        _ners = getNers(_sentences, useselffool=True)
        word_index = 0
        for ners, sentence in zip(_ners, _sentences):
            if len(ners) != 0:
                word_ner_list = ['O'] * len(sentence)
                for ner in ners:
                    # fool-style tuple: (begin, end, type, text)
                    nerDict = dict()
                    entity_type = ner[2]
                    nerDict['entity_type'] = entity_type
                    entity_text = ner[3]
                    nerDict['entity_text'] = entity_text
                    begin_index = ner[0]
                    nerDict['begin_index'] = begin_index
                    end_index = ner[1] - 1
                    nerDict['end_index'] = end_index
                    # article-level character offsets
                    wordOffset_begin = word_index + begin_index
                    nerDict['wordOffset_begin'] = wordOffset_begin
                    wordOffset_end = wordOffset_begin + len(entity_text)
                    nerDict['wordOffset_end'] = wordOffset_end
                    nerDict['sentence'] = sentence
                    nerDict['article_index'] = index
                    nersList.append(nerDict)
                    word_ner_list[begin_index] = 'B'
                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
            # advance the offset even for sentences without entities,
            # otherwise the article-level offsets drift
            word_index += len(sentence)
    # get_unionNers() and get_complainant() read this pickle
    save(nersList, "nersList.pk")
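

# A minimal, self-contained sketch of the offset/BIO bookkeeping in get_ners.
# `_demo_ner_offsets` is a hypothetical helper (not part of the original
# pipeline); the tuples follow the (begin, end, type, text) convention used
# above, where `end - 1` is the exclusive end of the entity span.
def _demo_ner_offsets():
    sentences = ["投诉人:某某公司。", "被投诉人:另一公司。"]
    fake_ners = [[(4, 9, 'company', '某某公司')], [(5, 10, 'company', '另一公司')]]
    word_index = 0  # running offset of the current sentence within the article
    for ners, sentence in zip(fake_ners, sentences):
        for begin, end, entity_type, entity_text in ners:
            bio = ['O'] * len(sentence)
            bio[begin] = 'B'
            bio[begin + 1:end - 1] = ['I'] * (end - 1 - begin - 1)
            print(entity_text, 'sentence offsets:', begin, end - 1,
                  'article offsets:', word_index + begin, word_index + begin + len(entity_text))
        word_index += len(sentence)  # advance even for entity-free sentences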


# merge adjacent (org/company) and (person) entities
def get_unionNers():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    ners = load("nersList.pk")
    org_companys = [[] for _ in range(len(data))]
    type1 = ['org', 'company', 'union_oc']
    persons = [[] for _ in range(len(data))]
    type2 = ['person', 'union_person']
    for ner in ners:
        if ner['entity_type'] in type1:
            org_companys[ner['article_index']].append(ner)
        if ner['entity_type'] in type2:
            persons[ner['article_index']].append(ner)
    # merge org and company entities
    new_org_companys = []
    for org_company in org_companys:
        if org_company and len(org_company) > 1:
            union_nums = 0
            for i in range(len(org_company) - 1):
                # separated by a single '、' within the same sentence
                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 \
                        and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    org_company[i] = 0
                    union_nums += 1
                # directly adjacent within the same sentence
                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    org_company[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                org_company.remove(0)
        # append unconditionally so the list stays aligned with the articles
        new_org_companys.append(org_company)
    # merge person entities
    new_persons = []
    for person in persons:
        if person and len(person) > 1:
            union_nums = 0
            for i in range(len(person) - 1):
                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 \
                        and person[i]['sentence'][person[i]['end_index']] == '、' \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    person[i] = 0
                    union_nums += 1
                elif person[i]['end_index'] == person[i + 1]['begin_index'] \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    person[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                person.remove(0)
        new_persons.append(person)
    # get_complainant() reads this pickle
    save([new_org_companys, new_persons], "unionNers.pk")


def test02():
    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]")
    # need_index = []
    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
    #     a = 0
    #     if text_rule.search(text):
    #         a = 1
    #     if title_rule.search(title):
    #         a = 0
    #     if text_rule.search(title):
    #         a = 1
    #     if a:
    #         need_index.append(index)
    # print(len(need_index))
    # load = load.loc[need_index]
    # print(len(load))
    # load = load.reset_index(drop=True)
    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
    complainants_list = []
    a = 1
    load = load[9744:9745]  # debug: inspect a single article
    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
        print(a)
        a += 1
        getSentences = sentences.split('*#*>')
        ners = getNers(getSentences, useselffool=True)
        print(ners)
        print('======================')
        word_index = 0
        ners_list = []
        size = 16
        complainants = []  # collected per article
        for ner, sentence in zip(ners, getSentences):
            if len(ner) != 0:
                for aner in ner:
                    entity_type = aner[2]
                    entity_text = aner[3]
                    # begin = word_index + aner[0]
                    # end = begin + len(entity_text)
                    # complainant?
                    if entity_type in ['org', 'company', 'person']:
                        left = sentence[max(0, aner[0] - size):aner[0]]
                        print(entity_text, left, sentence)
                        if complaint_rule.search(left):
                            print('yes')
                            entity_type = 'complainant'
                            complainants.append(entity_text)
                            # ners_list.append([begin, end, entity_type, entity_text])
            word_index += len(sentence)
        complainants_list.append(complainants)
    load['complainant'] = complainants_list
    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
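

# A small, self-contained check of the left-context idea used in test02 and
# get_complainant: take `size` characters to the left of an entity and test a
# complainant trigger regex against that window. The rule here is a trimmed
# copy of `complaint_rule`; the sentence and offsets are made up.
def _demo_left_context():
    rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方])[\d一二三四五六七八九十]?[::]+")
    sentence = "投诉人:福建光正工程项目管理有限公司,联系地址略。"
    entity_begin, size = 4, 16
    left = sentence[max(0, entity_begin - size):entity_begin]
    print(repr(left), '->', bool(rule.search(left)))  # True: treated as complainant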


# complainants, respondents and punished parties (投诉人、被投诉人、被处罚人)
def get_complainant():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
    # ners = load("nersList.pk")
    unionNers = load("unionNers.pk")
    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
    complainants = [[] for _ in range(len(data))]
    punishPeople = [[] for _ in range(len(data))]
    a = ['org', 'company', 'person']
    size = 16
    # complainants / challengers
    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished / complained-against parties
    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    time1 = time.time()
    for _ner in ners:
        if _ner:
            for ner in _ner:
                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
                if complainants_rule1.search(left):
                    complainants[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule1.search(left):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
                    if data['类别'][ner['article_index']] == '投诉处理':
                        complainants[ner['article_index']].append(ner['entity_text'])
                    else:
                        punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
    data['complainant'] = complainants
    data['punishPeople'] = punishPeople
    print(time.time() - time1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")


def get_complainant2(list_sentences, list_entitys, text_type):
    '''
    :param list_sentences: list_sentences from get_preprocessed()
    :param list_entitys: list_entitys from get_preprocessed()
    :param text_type: article category (punishment type)
    :return: complainants: list of complainants;
             punishPeople: complained-against / punished parties
    '''
    sentences_list = list_sentences
    entitys_list = list_entitys
    size = 16
    a = ['org', 'company', 'person']
    b = ['org', 'company', 'union_org_company']
    c = ['person', 'union_person']
    need_entitys = []
    for entity in entitys_list:
        if entity.entity_type in a:
            need_entitys.append(entity)
    # merge adjacent entities of the same family
    drop_count = 0
    for i in range(1, len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        sentence = sentences_list[entity.sentence_index].sentence_text
        last_entity = need_entitys[i - 1]
        if entity.sentence_index == last_entity.sentence_index:
            if (entity.entity_type in b and last_entity.entity_type in b) or (
                    entity.entity_type in c and last_entity.entity_type in c):
                # at most one separator character ('', '、', '和', '及') between them
                if entity_begin - last_entity.wordOffset_end < 2 and sentence[
                        last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
                    need_entitys[i].begin_index = last_entity.begin_index
                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
                    if entity.entity_type in b:
                        need_entitys[i].entity_type = 'union_org_company'
                    else:
                        need_entitys[i].entity_type = 'union_person'
                    need_entitys[i - 1] = 0
                    drop_count += 1
    for _ in range(drop_count):
        need_entitys.remove(0)
    # complainants / challengers
    complainants_rule1 = re.compile(
        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished / complained-against parties
    punishPeople_rule1 = re.compile(
        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    complainants = []
    punishPeople = []
    for i in range(len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        # the sentence containing the entity
        sentence = sentences_list[entity.sentence_index].sentence_text
        left = sentence[max(0, entity_begin - size):entity_begin]
        right = sentence[entity_end:min(entity_end + size, len(sentence))]
        if complainants_rule1.search(left):
            complainants.append(entity)
        elif punishPeople_rule1.search(left):
            punishPeople.append(entity)
        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
            if text_type == '投诉处理':
                complainants.append(entity)
            else:
                punishPeople.append(entity)
        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
            punishPeople.append(entity)
    # unpack merged (union) entities back into individual names
    result_complainants = []
    result_punishPeople = []
    for entity in complainants:
        if entity.entity_type in ['union_org_company', 'union_person']:
            entity_text = entity.entity_text.split('+')
            for item in entity_text:
                result_complainants.append(item)
        else:
            result_complainants.append(entity.entity_text)
    for entity in punishPeople:
        if entity.entity_type in ['union_org_company', 'union_person']:
            entity_text = entity.entity_text.split('+')
            for item in entity_text:
                result_punishPeople.append(item)
        else:
            result_punishPeople.append(entity.entity_text)
    return list(set(result_complainants)), list(set(result_punishPeople))
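

# A hedged usage sketch for get_complainant2. The real list_sentences /
# list_entitys come from get_preprocessed(); the stand-in classes below are
# hypothetical and only mimic the attributes the function actually reads.
class _FakeSentence:
    def __init__(self, text):
        self.sentence_text = text


class _FakeEntity:
    def __init__(self, etype, text, sent_idx, begin, end):
        self.entity_type = etype
        self.entity_text = text
        self.sentence_index = sent_idx
        self.begin_index = begin
        self.wordOffset_begin = begin
        self.wordOffset_end = end


def _demo_get_complainant2():
    sent = "投诉人:福建光正工程项目管理有限公司,一、投诉事项。"
    sentences = [_FakeSentence(sent)]
    entitys = [_FakeEntity('company', '福建光正工程项目管理有限公司', 0, 4, 18)]
    # expected: (['福建光正工程项目管理有限公司'], [])
    print(get_complainant2(sentences, entitys, '投诉处理'))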
"(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)") punishPeople_rule2_1 = re.compile(",$") punishPeople_rule2_2 = re.compile("^[::]") punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$") punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)") complainants = [] punishPeople = [] for i in range(len(need_entitys)): entity = need_entitys[i] entity_begin = entity.wordOffset_begin entity_end = entity.wordOffset_end # entity所在句子 sentence = sentences_list[entity.sentence_index].sentence_text left = sentence[max(0, entity_begin - size):entity_begin] right = sentence[entity_end:min(entity_end + size, len(sentence))] if complainants_rule1.search(left): complainants.append(entity) elif punishPeople_rule1.search(left): punishPeople.append(entity) elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right): if text_type == '投诉处理': complainants.append(entity) else: punishPeople.append(entity) elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right): punishPeople.append(entity) result_complainants = [] result_punishPeople = [] for entity in complainants: if entity.entity_type in ['union_org_company', 'union_person']: entity_text = entity.entity_text.split('+') for item in entity_text: result_complainants.append(item) else: result_complainants.append(entity.entity_text) for entity in punishPeople: if entity.entity_type in ['union_org_company', 'union_person']: entity_text = entity.entity_text.split('+') for item in entity_text: result_punishPeople.append(item) else: result_punishPeople.append(entity.entity_text) return list(set(result_complainants)), list(set(result_punishPeople)) # 公告分类 def textClassify(): data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0) #投诉人|检举人|举报人|质疑人|质疑函 patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位" re1 = re.compile(patten1) patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止" re2 = re.compile(patten2) patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门" re3 = re.compile(patten3) patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为" re4 = re.compile(patten4) patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]" re5 = re.compile(patten5) patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业" re6 = re.compile(patten6) patten7 = '处理决定' re7 = re.compile(patten7) patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据" re8 = re.compile(patten8) pos = [] _type = [] for title,text in zip(data['PAGE_TITLE'],data["PAGE_CONTENT"]): p = [] t = '' if re1.search(text) or re1.search(title): p.append(patten1) t = '投诉' elif re2.search(text) and re.search('投诉',text): p.append('投诉+'+patten2) t = '投诉' elif re.search("回复",title): p.append("回复") t = '投诉' if len(p)==0: if re3.search(title) or re3.search(text): p.append(patten3) t = '处罚' elif re4.search(title): p.append(patten4) t = '处罚' elif re5.search(title) or re5.search(text): p.append(patten5) t = '处罚' elif re6.search(text) or re6.search(title): p.append(patten6) t = '处罚' elif re8.search(text): p.append(patten8) t = '处罚' if len(p) == 0: if re7.search(text) and re.search('投诉', text): p.append('投诉+' + patten7) t = '投诉' elif re7.search(text) or re7.search(title): p.append("处罚+"+patten7) t = '处罚' pos.append(p) _type.append(t) data['pos'] = pos data['type'] = _type data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv") # 投诉是否成立 def get_punishWhether01(): data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv",index_col=0) data = data[data['type']=='投诉'] punishWhether_1 = 
re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持") punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回") punishWhether = [] punishDecision = [] punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)") punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))") punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|[裁决|处理]依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、") punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|[裁决|处理]依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))") punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))") punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))") def findDecision(text): decision = '' if punishDecision_1.search(text): decision = punishDecision_1.search(text).group(1) elif punishDecision_2.search(text): decision = punishDecision_2.search(text).group(1) elif punishDecision_3.search(text): decision = punishDecision_3.search(text).group(1) elif punishDecision_4.search(text): decision = punishDecision_4.findall(text) decision = decision[-1] elif punishDecision_5.search(text): decision = punishDecision_5.search(text).group(1) elif punishDecision_6.search(text): decision = punishDecision_6.findall(text) decision1 = decision[-1] if re.search("诉讼",decision1) and len(decision)>1: decision1 = decision[-2] decision = decision1 return decision for text in data['PAGE_CONTENT']: pw = '' if punishWhether_1.search(text): pw = 1 elif punishWhether_0.search(text): pw = 0 punishWhether.append(pw) mid = len(text)//2 lower_half = text[mid:] decision = findDecision(lower_half) if decision == '': decision = findDecision(text) # if punishDecision_1.search(text): # decision = punishDecision_1.search(text).group(1) # # elif punishDecision_2.search(text): # decision = punishDecision_2.search(text).group(1) # elif punishDecision_3.search(text): # decision = punishDecision_3.search(text).group(1) # elif punishDecision_4.search(text): # decision = punishDecision_4.findall(text) # decision = decision[-1] # elif punishDecision_5.search(text): # decision = punishDecision_5.findall(text) # decision = decision[-1] punishDecision.append(decision) data['punishWhether'] = punishWhether data['punishDecision'] = punishDecision data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv") # 处罚决定 def get_punishDecision(): data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0) data = data[data['type'] == '处罚'] punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)") punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))") punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)") punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、") punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))") punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))") punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))") punishDecision = [] for text in data['PAGE_CONTENT']: decision 


# enforcement agency and punishment time (执法机构、处罚时间)
def get_institution():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
    ners = load("nersList.pk")
    orgs = [[] for _ in range(len(data))]
    times = [[] for _ in range(len(data))]
    institutions = [[] for _ in range(len(data))]
    punishTimes = [[] for _ in range(len(data))]
    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
    for ner in ners:
        if ner['entity_type'] == 'org':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if institution_1.search(left):
                institutions[ner['article_index']].append(ner['entity_text'])
            orgs[ner['article_index']].append(ner)
        elif ner['entity_type'] == 'time':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if punishTimes_1.search(left):
                punishTimes[ner['article_index']].append(ner['entity_text'])
            times[ner['article_index']].append(ner)
    # keep only the last few candidates per article
    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
    times = [t[-3:] if len(t) > 3 else t for t in times]
    data['org'] = orgs
    data['time'] = times
    data['institution'] = institutions
    data['punishTime'] = punishTimes
    # data = data[data['type'].isin(["投诉","处罚"])]
    print(len(data))
    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
    institution_list = []
    punishTime_list = []
    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    for title, text, org, n_time, institution, punishTime in zip(
            data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'], data['time'],
            data['institution'], data['punishTime']):
        ins = ''
        ptime = ''
        if punishTime:
            ptime = punishTime
        if institution:
            ins = institution
        else:
            # fall back to an agency-looking org in the title
            title_ners = getNers([title], useselffool=True)
            if title_ners[0]:
                for title_ner in title_ners[0]:
                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
                        ins = title_ner[3]
                        break
            # if ins == '':
            # fall back to the last org that is immediately followed by a date
            for _org in org[::-1]:
                right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
                if institution_time.search(right):
                    if ins == '':
                        ins = _org['entity_text']
                    if ptime == '':
                        ptime = institution_time.search(right).group(1)
                    break
        if ptime == '' and len(n_time) != 0:
            # a date at the very end of the article is usually the decision date
            textLong = len(text)
            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
                ptime = n_time[-1]['entity_text']
        institution_list.append(ins)
        punishTime_list.append(ptime)
    data['institution'] = institution_list
    data['punishTime'] = punishTime_list
    data = data.drop(columns=['org', 'time'], axis=1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
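

# A self-contained check of the date-after-agency heuristic in get_institution:
# look at up to 16 characters to the right of an org entity and try to read a
# date there. The regex is copied from `institution_time`; the sentence and
# offset are made up.
def _demo_org_followed_by_date():
    date_rule = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    sentence = "某某市财政局,2019年5月8日。"
    org_end = 6  # position just past "某某市财政局"
    right = sentence[org_end:org_end + 16]
    m = date_rule.search(right)
    print(m.group(1) if m else '')  # -> ,2019年5月8日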


# punishment type (处罚类型)
def get_punishType():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # tentative labels: 严重违法失信, 行政处罚, 投诉处理, 监督检查, 其他失信记录
    # unrelated announcements
    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
    othertype = "其他无关公告"
    # complaint handling
    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
    re1_2 = re.compile("处理决定|回复")
    re1_type = '投诉处理'
    # supervision and inspection
    re2 = re.compile("监督检查|监管调查|监督处理")
    re2_type = "监督检查"
    # administrative punishment
    re3 = re.compile("行政处罚|行政处理")
    re3_type = "行政处罚"
    # serious dishonesty / lawbreaking
    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
    re4_type = "严重违法失信"
    # other dishonesty announcements
    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
                          "违[规法]行为|违规事项|考评依据|失信行为")
    re_otherType = "其他失信公告"
    punishType_list = []
    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
        punishType = ''
        titleWithText = title + text
        if title_rule.search(title):
            punishType = othertype
        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
            punishType = re1_type
        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
            punishType = re1_type
        elif re2.search(titleWithText):
            punishType = re2_type
        elif re3.search(titleWithText):
            punishType = re3_type
        elif re4.search(titleWithText):
            punishType = re4_type
        elif re_other.search(titleWithText) or re.search("处罚", title):
            punishType = re_otherType
        punishType_list.append(punishType)
    data['punishType'] = punishType_list
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')


def getNers_my(sentences, MAXAREA=10000, useselffool=False):
    '''
    :param sentences: list of sentences to tag
    :return: NER results per sentence, computed in throttled batches
    '''
    def getData(ners, process_data):
        process_sentences = [item[1] for item in process_data]
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    # sort longest-first so each batch is bounded by its first sentence
    sents = []
    for i in range(len(sentences)):
        sents.append([i, sentences[i]])
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    ners = [[] for i in range(len(sentences))]
    while True:
        width = len(sents[index_][1])
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
        if index_ >= len(sents):
            break
    return ners


# html announcement preprocessing (网页公告处理)
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: source html of the articles to process
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles
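

# A small sketch of the MAXAREA batching used in getNers_my: sentences are
# sorted longest-first and grouped so that (length of the longest sentence in
# the batch) * (batch size) stays near MAXAREA, bounding the padded tensor
# size per NER call. The sentences and the MAXAREA value are illustrative.
def _demo_maxarea_batching(MAXAREA=10):
    sents = sorted(enumerate(["短句", "一个比较长的句子啊啊", "中等长度句子"]),
                   key=lambda x: len(x[1]), reverse=True)
    index_ = 0
    while index_ < len(sents):
        width = len(sents[index_][1])  # longest sentence in this batch
        height = min(MAXAREA // width + 1, len(sents) - index_)
        print('batch:', [i for i, _ in sents[index_:index_ + height]],
              'width:', width, 'rows:', height)
        index_ += height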


# sentence splitting (分句处理)
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article texts
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # nlp handling
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            # split on 。 and drop duplicate sentences
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # throttled execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences, useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
            if len(list_sentences_temp) == 0:
                # fall back to the whole article as a single sentence
                list_sentences_temp.append(article_processed)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences


def ronghe():
    # merge test on a hand-built example
    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'),
            (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'),
            (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'),
            (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'),
            (116, 130, 'company', '福建省富诚工程管理有限公司'),
            (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
    s = ['person', 'org', 'company', 'union']
    remove_num = 0
    # stop at len(ners) - 1: each step looks ahead to ners[i + 1]
    for i in range(len(ners) - 1):
        print(0)
        ner = ners[i]
        begin = ner[0]
        end = ner[1]
        ner_type = ner[2]
        if ner_type in s:
            if end == ners[i + 1][0] and a[end - 1] == '、':
                print(1)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + '、' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
                continue
            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
                print(2)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + ',' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
    for i in range(remove_num):
        ners.remove(0)
    print(ners)


if __name__ == '__main__':
    # get_data1()
    # get_ners()
    # test02()
    # get_unionNers()
    # complainants and complained-against/punished parties
    # get_complainant()
    # ronghe()
    # classification
    # textClassify()
    # complaint upheld? + decision (complaint articles)
    # get_punishWhether01()
    # decision (punishment articles)
    # get_punishDecision()
    # enforcement agency and punishment time
    get_institution()
    # punishment type
    # get_punishType()
    pass