@@ -0,0 +1,831 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+import re
+import time  # used below; imported explicitly rather than via the star imports
+import pandas as pd
+from bs4 import BeautifulSoup  # used in get_article1()
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Entitys import *
+from BiddingKG.dl.interface.predictor import *
+from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.interface.Preprocessing import *
+
+# Load the three raw CSV dumps, clean the HTML, split into sentences, save ALLDATA.csv
+def get_data1():
+    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
+    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
+    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
+    load = pd.concat([load1, load2, load3], axis=0)
+    load = load.reset_index(drop=True)
+    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
+    sentences_list = get_sentences1(load['PAGE_CONTENT'])
+    # join sentences with a separator that is unlikely to occur in the text
+    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
+    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
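+
+# Minimal sketch (not part of the pipeline): sentences are flattened into one CSV
+# cell with the '*#*>' separator and recovered later by the exact inverse split.
+def _demo_sentence_roundtrip():
+    sentences = ['投诉处理决定书。', '投诉人:某公司。']
+    cell = '*#*>'.join(sentences)
+    assert cell.split('*#*>') == sentences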
+
+# Run NER over every article's sentences and collect one dict per entity
+def get_ners():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # data = data.head(3)
+    nersList = []
+    for index, _sentences in zip(data.index, data['sentences']):
+        _sentences = _sentences.split('*#*>')
+        _ners = getNers(_sentences, useselffool=True)
+        word_index = 0  # running character offset of the sentence within the article
+        for ners, sentence in zip(_ners, _sentences):
+            if len(ners) != 0:
+                word_ner_list = ['O'] * len(sentence)  # per-character BIO tags
+                for ner in ners:
+                    nerDict = dict()
+                    entity_type = ner[2]
+                    nerDict['entity_type'] = entity_type
+                    entity_text = ner[3]
+                    nerDict['entity_text'] = entity_text
+                    begin_index = ner[0]
+                    nerDict['begin_index'] = begin_index
+                    end_index = ner[1] - 1
+                    nerDict['end_index'] = end_index
+                    # offsets relative to the whole article
+                    wordOffset_begin = word_index + begin_index
+                    nerDict['wordOffset_begin'] = wordOffset_begin
+                    wordOffset_end = wordOffset_begin + len(entity_text)
+                    nerDict['wordOffset_end'] = wordOffset_end
+                    nerDict['sentence'] = sentence
+                    nerDict['article_index'] = index
+                    nersList.append(nerDict)
+                    word_ner_list[begin_index] = 'B'
+                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
+            word_index += len(sentence)
+    # save(nersList, "nersList.pk")
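+
+# Minimal, self-contained sketch (not part of the pipeline) of the offset
+# bookkeeping above: a getNers()-style tuple is assumed to be
+# (begin, end, type, text), and word_index shifts sentence offsets to
+# article-level offsets.
+def _demo_ner_offsets():
+    sentence = "投诉人:福建光正工程项目管理有限公司。"
+    ner = (4, 18, 'company', '福建光正工程项目管理有限公司')  # hypothetical NER output
+    entity_text = ner[3]
+    assert sentence[ner[0]:ner[0] + len(entity_text)] == entity_text
+    word_index = 0  # this is the first sentence of the article
+    print('article offsets:', word_index + ner[0], word_index + ner[0] + len(entity_text))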
+
+# Merge adjacent (org/company) and (person) entities, e.g. "A公司、B公司" -> one entity
+def get_unionNers():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    ners = load("nersList.pk")
+    org_companys = [[] for _ in range(len(data))]
+    type1 = ['org', 'company', 'union_oc']
+    persons = [[] for _ in range(len(data))]
+    type2 = ['person', 'union_person']
+    for ner in ners:
+        if ner['entity_type'] in type1:
+            org_companys[ner['article_index']].append(ner)
+        if ner['entity_type'] in type2:
+            persons[ner['article_index']].append(ner)
+    # merge org and company entities that sit next to each other (or are separated
+    # only by '、') in the same sentence
+    new_org_companys = []
+    for org_company in org_companys:
+        if org_company and len(org_company) > 1:
+            union_nums = 0
+            for i in range(len(org_company) - 1):
+                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 \
+                        and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
+                    org_company[i] = 0  # mark the absorbed entity for removal
+                    union_nums += 1
+                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
+                    org_company[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                org_company.remove(0)
+        # append unconditionally so the list stays aligned with article indices
+        new_org_companys.append(org_company)
+    # merge persons the same way
+    new_persons = []
+    for person in persons:
+        if person and len(person) > 1:
+            union_nums = 0
+            for i in range(len(person) - 1):
+                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 \
+                        and person[i]['sentence'][person[i]['end_index']] == '、' \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    person[i] = 0
+                    union_nums += 1
+                elif person[i]['end_index'] == person[i + 1]['begin_index'] \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    person[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                person.remove(0)
+        new_persons.append(person)
+    # save([new_org_companys, new_persons], "unionNers.pk")
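+
+# Minimal sketch (not part of the pipeline) of the adjacency rule above, on fake
+# dicts; end_index is taken here as pointing one past the entity's last character,
+# which is how the merge condition reads it.
+def _demo_union_merge():
+    sentence = "被投诉人:A公司、B公司。"
+    e1 = {'entity_text': 'A公司', 'begin_index': 5, 'end_index': 8, 'sentence': sentence}
+    e2 = {'entity_text': 'B公司', 'begin_index': 9, 'end_index': 12, 'sentence': sentence}
+    if e1['end_index'] == e2['begin_index'] - 1 and sentence[e1['end_index']] == '、':
+        e2['begin_index'] = e1['begin_index']
+        e2['entity_text'] = e1['entity_text'] + '+' + e2['entity_text']
+    print(e2['entity_text'])  # -> A公司+B公司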
+
+def test02():
+    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+
+    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
+    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]")
+    # need_index = []
+    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
+    #     a = 0
+    #     if text_rule.search(text):
+    #         a = 1
+    #     if title_rule.search(title):
+    #         a = 0
+    #     if text_rule.search(title):
+    #         a = 1
+    #     if a:
+    #         need_index.append(index)
+    # print(len(need_index))
+    # load = load.loc[need_index]
+    # print(len(load))
+    # load = load.reset_index(drop=True)
+
+    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
+    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
+    complainants_list = []
+    a = 1
+    load = load[9744:9745]  # debug: inspect a single article
+    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
+        print(a)
+        a += 1
+        getSentences = sentences.split('*#*>')
+        # print(getSentences)
+        ners = getNers(getSentences, useselffool=True)
+        print(ners)
+        print('======================')
+        word_index = 0
+        ners_list = []
+        for ner, sentence in zip(ners, getSentences):
+            size = 16  # chars of left context to inspect
+            complainants = []
+            if len(ner) != 0:
+                for aner in ner:
+                    entity_type = aner[2]
+                    entity_text = aner[3]
+                    # begin = word_index + aner[0]
+                    # end = begin + len(entity_text)
+                    # complainant?
+                    if entity_type in ['org', 'company', 'person']:
+                        left = sentence[max(0, aner[0] - size):aner[0]]
+                        print(entity_text, left, sentence)
+                        if complaint_rule.search(left):
+                            print('yes')
+                            entity_type = 'complainant'
+                            complainants.append(entity_text)
+                            # ners_list.append([begin, end, entity_type, entity_text])
+            word_index += len(sentence)
+            complainants_list.append(complainants)  # note: one list per sentence
+
+    # test
+    # for i in ners_list:
+    #     print(i[3])
+    #     print(processed[0][i[0]:i[1]])
+    load['complainant'] = complainants_list
+    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
+
+# Complainant, complained-about party, punished party
+def get_complainant():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
+    # ners = load("nersList.pk")
+    unionNers = load("unionNers.pk")
+    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
+    complainants = [[] for _ in range(len(data))]
+    punishPeople = [[] for _ in range(len(data))]
+    a = ['org', 'company', 'person']
+    size = 16  # context window (chars) on each side of the entity
+    # complainant / challenger
+    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party / complained-about party
+    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+    time1 = time.time()
+    for _ner in ners:
+        if _ner:
+            for ner in _ner:
+                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
+                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
+                if complainants_rule1.search(left):
+                    complainants[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule1.search(left):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                    # ambiguous label pattern: the article class decides which side
+                    if data['类别'][ner['article_index']] == '投诉处理':
+                        complainants[ner['article_index']].append(ner['entity_text'])
+                    else:
+                        punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+    data['complainant'] = complainants
+    data['punishPeople'] = punishPeople
+    print(time.time() - time1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
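+
+# Minimal sketch (not part of the pipeline) of the left-context rule: take up to
+# `size` chars before the entity and test the complainant pattern against them.
+def _demo_context_window():
+    rule = re.compile("(?:[^被]|^)投诉人[::]+$")  # simplified stand-in for complainants_rule1
+    sentence = "投诉人:张三,被投诉人:李四。"
+    begin_index = 4  # offset of the entity '张三'
+    left = sentence[max(0, begin_index - 16):begin_index]
+    print(bool(rule.search(left)))  # -> True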
+
+def get_complainant2(list_sentences, list_entitys, text_type):
+    '''
+    list_sentences: list_sentences from get_preprocessed()
+    list_entitys: list_entitys from get_preprocessed()
+    text_type: article category (punishment type)
+    :return:
+    complainants: list of complainants
+    punishPeople: complained-about / punished parties
+    '''
+    sentences_list = list_sentences
+    entitys_list = list_entitys
+    size = 16
+    a = ['org', 'company', 'person']
+    b = ['org', 'company', 'union_org_company']
+    c = ['person', 'union_person']
+    need_entitys = []
+    for entity in entitys_list:
+        if entity.entity_type in a:
+            need_entitys.append(entity)
+    # merge adjacent entities of the same family within a sentence
+    drop_count = 0
+    for i in range(1, len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        last_entity = need_entitys[i - 1]
+        if entity.sentence_index == last_entity.sentence_index:
+            if (entity.entity_type in b and last_entity.entity_type in b) or \
+                    (entity.entity_type in c and last_entity.entity_type in c):
+                if entity_begin - last_entity.wordOffset_end < 2 and \
+                        sentence[last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
+                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
+                    need_entitys[i].begin_index = last_entity.begin_index
+                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
+                    if entity.entity_type in b:
+                        need_entitys[i].entity_type = 'union_org_company'
+                    else:
+                        need_entitys[i].entity_type = 'union_person'
+                    need_entitys[i - 1] = 0  # mark the absorbed entity for removal
+                    drop_count += 1
+    for _ in range(drop_count):
+        need_entitys.remove(0)
+    # complainant / challenger
+    complainants_rule1 = re.compile(
+        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party / complained-about party
+    punishPeople_rule1 = re.compile(
+        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+    complainants = []
+    punishPeople = []
+    for i in range(len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        # the sentence containing this entity
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        left = sentence[max(0, entity_begin - size):entity_begin]
+        right = sentence[entity_end:min(entity_end + size, len(sentence))]
+        if complainants_rule1.search(left):
+            complainants.append(entity)
+        elif punishPeople_rule1.search(left):
+            punishPeople.append(entity)
+        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+            if text_type == '投诉处理':
+                complainants.append(entity)
+            else:
+                punishPeople.append(entity)
+        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+            punishPeople.append(entity)
+
+    # split merged union entities back into their component names
+    result_complainants = []
+    result_punishPeople = []
+    for entity in complainants:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_complainants.append(item)
+        else:
+            result_complainants.append(entity.entity_text)
+    for entity in punishPeople:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_punishPeople.append(item)
+        else:
+            result_punishPeople.append(entity.entity_text)
+    return list(set(result_complainants)), list(set(result_punishPeople))
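+
+# Hypothetical usage sketch (not part of the original): the arguments come from the
+# project's preprocessing step; the exact producer function is assumed here.
+# list_sentences, list_entitys = preprocess(articles)   # assumed helper
+# complainants, punishPeople = get_complainant2(list_sentences[0], list_entitys[0], '投诉处理')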
+
+# Announcement classification: complaint vs. punishment
+def textClassify():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # complainant / informant / whistle-blower / challenger keywords
+    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
+    re1 = re.compile(patten1)
+    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
+    re2 = re.compile(patten2)
+    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
+    re3 = re.compile(patten3)
+    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
+    re4 = re.compile(patten4)
+    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
+    re5 = re.compile(patten5)
+    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
+    re6 = re.compile(patten6)
+    patten7 = '处理决定'
+    re7 = re.compile(patten7)
+    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
+    re8 = re.compile(patten8)
+    pos = []
+    _type = []
+    for title, text in zip(data['PAGE_TITLE'], data["PAGE_CONTENT"]):
+        p = []
+        t = ''
+        if re1.search(text) or re1.search(title):
+            p.append(patten1)
+            t = '投诉'
+        elif re2.search(text) and re.search('投诉', text):
+            p.append('投诉+' + patten2)
+            t = '投诉'
+        elif re.search("回复", title):
+            p.append("回复")
+            t = '投诉'
+        if len(p) == 0:
+            if re3.search(title) or re3.search(text):
+                p.append(patten3)
+                t = '处罚'
+            elif re4.search(title):
+                p.append(patten4)
+                t = '处罚'
+            elif re5.search(title) or re5.search(text):
+                p.append(patten5)
+                t = '处罚'
+            elif re6.search(text) or re6.search(title):
+                p.append(patten6)
+                t = '处罚'
+            elif re8.search(text):
+                p.append(patten8)
+                t = '处罚'
+        if len(p) == 0:
+            if re7.search(text) and re.search('投诉', text):
+                p.append('投诉+' + patten7)
+                t = '投诉'
+            elif re7.search(text) or re7.search(title):
+                p.append("处罚+" + patten7)
+                t = '处罚'
+        pos.append(p)
+        _type.append(t)
+    data['pos'] = pos
+    data['type'] = _type
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
+
+# Whether the complaint is upheld, plus the decision text
+def get_punishWhether01():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '投诉']
+    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
+    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
+    punishWhether = []
+    punishDecision = []
+    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
+    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
+    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|[裁决|处理]依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|[裁决|处理]依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
+    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
+    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+
+    def findDecision(text):
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.findall(text)
+            decision = decision[-1]
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.search(text).group(1)
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.findall(text)
+            decision1 = decision[-1]
+            # skip a final clause that merely mentions litigation rights
+            if re.search("诉讼", decision1) and len(decision) > 1:
+                decision1 = decision[-2]
+            decision = decision1
+        return decision
+
+    for text in data['PAGE_CONTENT']:
+        pw = ''
+        if punishWhether_1.search(text):
+            pw = 1
+        elif punishWhether_0.search(text):
+            pw = 0
+        punishWhether.append(pw)
+
+        # decisions usually sit near the end; search the second half first
+        mid = len(text) // 2
+        lower_half = text[mid:]
+        decision = findDecision(lower_half)
+        if decision == '':
+            decision = findDecision(text)
+        punishDecision.append(decision)
+    data['punishWhether'] = punishWhether
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
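+
+# Minimal sketch (not part of the pipeline) of the upheld / not-upheld patterns,
+# using trimmed variants of punishWhether_1 / punishWhether_0.
+def _demo_punish_whether():
+    upheld = re.compile("投诉[^。,,不]+?成立|投诉成立|情况属实|予以支持")
+    rejected = re.compile("投诉[^。,,]*不能?成立|撤诉|不予支持")
+    print(upheld.search("经查,投诉事项成立。") is not None)          # -> True
+    print(rejected.search("投诉事项不成立,予以驳回。") is not None)  # -> True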
+
+# Punishment decision (for punishment announcements)
+def get_punishDecision():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '处罚']
+    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
+    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
+    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
+    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
+    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
+    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+    punishDecision = []
+    for text in data['PAGE_CONTENT']:
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.search(text).group(1)
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.findall(text)
+            decision = decision[-1]
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.search(text).group(1)
+        elif punishDecision_7.search(text):
+            decision = punishDecision_7.findall(text)
+            decision = decision[-1]
+        punishDecision.append(decision)
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
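+
+# Minimal sketch (not part of the pipeline): a punishDecision_3-style score-deduction
+# capture and a punishDecision_2-style "result:" capture.
+def _demo_punish_decision():
+    deduction = re.compile("(扣分分?值[::][\d.]+分?)")
+    print(deduction.search("考评结果,扣分值:3分。").group(1))  # -> 扣分值:3分
+    decision = re.compile("(?:处罚结果|处理结果)[::]+(.+?(?:。|$))")
+    print(decision.search("处理结果:给予警告。").group(1))  # -> 给予警告。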
+
+# Enforcement institution and punishment time
+def get_institution():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
+    ners = load("nersList.pk")
+    orgs = [[] for _ in range(len(data))]
+    times = [[] for _ in range(len(data))]
+    institutions = [[] for _ in range(len(data))]
+    punishTimes = [[] for _ in range(len(data))]
+    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+    for ner in ners:
+        if ner['entity_type'] == 'org':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if institution_1.search(left):
+                institutions[ner['article_index']].append(ner['entity_text'])
+            orgs[ner['article_index']].append(ner)
+        elif ner['entity_type'] == 'time':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if punishTimes_1.search(left):
+                punishTimes[ner['article_index']].append(ner['entity_text'])
+            times[ner['article_index']].append(ner)
+    # keep only the last few candidates per article
+    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
+    times = [t[-3:] if len(t) > 3 else t for t in times]  # `t`, not `time`: avoid shadowing the module
+    data['org'] = orgs
+    data['time'] = times
+    data['institution'] = institutions
+    data['punishTime'] = punishTimes
+    # data = data[data['type'].isin(["投诉","处罚"])]
+    print(len(data))
+    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
+    institution_list = []
+    punishTime_list = []
+    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    for title, text, org, n_time, institution, punishTime in zip(data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'], data['time'], data['institution'], data['punishTime']):
+        ins = ''
+        ptime = ''
+        if punishTime:
+            ptime = punishTime
+        if institution:
+            ins = institution
+        else:
+            # fall back to an org entity found in the title
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        # 'title:' +
+                        ins = title_ner[3]
+                        break
+            # fall back to the last org entity immediately followed by a date
+            # if ins == '':
+            for _org in org[::-1]:
+                right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
+                if institution_time.search(right):
+                    if ins == '':
+                        # "text_EndWithTime:" +
+                        ins = _org['entity_text']
+                    if ptime == '':
+                        # "text_EndWithIns:" +
+                        ptime = institution_time.search(right).group(1)
+                    break
+        if ptime == '' and len(n_time) != 0:
+            textLong = len(text)
+            # a long time entity at the very end of the article is likely the sign-off date
+            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
+                # "EndOfText:" +
+                ptime = n_time[-1]['entity_text']
+
+        institution_list.append(ins)
+        punishTime_list.append(ptime)
+    data['institution'] = institution_list
+    data['punishTime'] = punishTime_list
+    data = data.drop(columns=['org', 'time'], axis=1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
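+
+# Minimal sketch (not part of the pipeline): an org entity immediately followed by a
+# date is read as the signing institution, and the date as the punish time.
+def _demo_institution_time():
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    right = ",2019年5月10日"  # hypothetical text to the right of an org entity
+    print(institution_time.search(right).group(1))  # -> ,2019年5月10日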
+
+# Punishment type
+def get_punishType():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # Tentative categories: serious illegal/dishonest conduct, administrative punishment,
+    # complaint handling, supervision & inspection, other dishonesty records
+
+    # other, unrelated announcements
+    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
+                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
+                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
+                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
+    othertype = "其他无关公告"
+    # complaint handling
+    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
+    re1_2 = re.compile("处理决定|回复")
+    re1_type = '投诉处理'
+    # supervision & inspection
+    re2 = re.compile("监督检查|监管调查|监督处理")
+    re2_type = "监督检查"
+    # administrative punishment
+    re3 = re.compile("行政处罚|行政处理")
+    re3_type = "行政处罚"
+    # serious illegal / dishonest conduct
+    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
+    re4_type = "严重违法失信"
+    # other dishonesty announcements
+    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
+                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
+                          "违[规法]行为|违规事项|考评依据|失信行为")
+    re_otherType = "其他失信公告"
+    punishType_list = []
+    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
+        punishType = ''
+        titleWithText = title + text
+        if title_rule.search(title):
+            punishType = othertype
+        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
+            punishType = re1_type
+        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
+            punishType = re1_type
+        elif re2.search(titleWithText):
+            punishType = re2_type
+        elif re3.search(titleWithText):
+            punishType = re3_type
+        elif re4.search(titleWithText):
+            punishType = re4_type
+        elif re_other.search(titleWithText) or re.search("处罚", title):
+            punishType = re_otherType
+        punishType_list.append(punishType)
+    data['punishType'] = punishType_list
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')
+
+def getNers_my(sentences, MAXAREA=10000, useselffool=False):
+    '''
+    @param sentences: list of sentences
+    @return: NER results, computed in rate-limited batches (width * height <= MAXAREA)
+    '''
+    def getData(ners, process_data):
+        process_sentences = [item[1] for item in process_data]
+        print(process_data)
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        print('ner_ :', ner_)
+        for i in range(len(ner_)):
+            the_index = process_data[i][0]
+            ners[the_index] = ner_[i]
+
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i, sentences[i]])
+    # longest sentences first, so each batch's width is its first element
+    sents.sort(key=lambda x: len(x[1]), reverse=True)
+    print(sents)
+    index_ = 0
+    ners = [[] for i in range(len(sentences))]
+
+    while True:
+        width = len(sents[index_][1])
+        height = MAXAREA // width + 1
+        if height > len(sents) - index_:
+            height = len(sents) - index_
+        process_data = sents[index_:index_ + height]
+        getData(ners, process_data)
+        index_ += height
+        if index_ >= len(sents):
+            break
+    return ners
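+
+# Minimal sketch (not part of the pipeline) of the MAXAREA batching idea above:
+# sort by length, take width*height-bounded batches, then write results back to
+# their original positions. `fake_ner` stands in for selffool.
+def _demo_batching():
+    def fake_ner(batch):
+        return [[(0, len(s), 'len', s)] for s in batch]
+    sentences = ['短句。', '这是一条比较长的句子。', '中等长度句子。']
+    MAXAREA = 12
+    sents = sorted(enumerate(sentences), key=lambda x: len(x[1]), reverse=True)
+    ners = [[] for _ in sentences]
+    index_ = 0
+    while index_ < len(sents):
+        width = len(sents[index_][1])
+        height = min(MAXAREA // width + 1, len(sents) - index_)
+        batch = sents[index_:index_ + height]
+        for (orig_i, _), ner in zip(batch, fake_ner([s for _, s in batch])):
+            ners[orig_i] = ner
+        index_ += height
+    print([n[0][3] for n in ners])  # results restored to input order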
+
+# HTML announcement preprocessing
+def get_article1(articles, cost_time=dict(), useselffool=True):
+    '''
+    :param articles: raw article source HTML to process
+    :param useselffool: whether to use selffool
+    :return: list_articles
+    '''
+    list_articles = []
+    for article in articles:
+        a_time = time.time()
+        sourceContent = article
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+
+        # log(article_processed)
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time() - start_time
+
+        # article_processed = article[1]
+        list_articles.append(article_processed)
+        print(time.time() - a_time)
+    return list_articles
+
+# Sentence splitting
+def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
+    '''
+    :param list_articles: preprocessed article text
+    :return: list_sentences
+    '''
+    list_sentences = []
+    for article in list_articles:
+        a_time = time.time()
+        list_sentences_temp = []
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = article
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time() - start_time
+
+        # NLP handling
+        if article_processed is not None and len(article_processed) != 0:
+            split_patten = "。"
+            sentences = []
+            _begin = 0
+            sentences_set = set()  # drop duplicate sentences
+            for _iter in re.finditer(split_patten, article_processed):
+                _sen = article_processed[_begin:_iter.span()[1]]
+                if len(_sen) > 0 and _sen not in sentences_set:
+                    sentences.append(_sen)
+                    sentences_set.add(_sen)
+                _begin = _iter.span()[1]
+            _sen = article_processed[_begin:]
+            if len(_sen) > 0 and _sen not in sentences_set:
+                sentences.append(_sen)
+                sentences_set.add(_sen)
+            # article = "".join(sentences)
+            '''
+            tokens_all = fool.cut(sentences)
+            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
+            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
+            ner_entitys_all = fool.ner(sentences)
+            '''
+            # rate-limited execution
+            key_nerToken = "nerToken"
+            start_time = time.time()
+            # tokens_all = getTokens(sentences, useselffool=useselffool)
+            if key_nerToken not in cost_time:
+                cost_time[key_nerToken] = 0
+            cost_time[key_nerToken] += time.time() - start_time
+
+            for sentence_index in range(len(sentences)):
+                sentence_text = sentences[sentence_index]
+                list_sentences_temp.append(sentence_text)
+
+            if len(list_sentences_temp) == 0:
+                # fall back to the whole article as a single "sentence"
+                # (the original appended the undefined loop variable here)
+                list_sentences_temp.append(article_processed)
+        list_sentences.append(list_sentences_temp)
+        print('2:', time.time() - a_time)
+    return list_sentences
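+
+# Minimal sketch (not part of the pipeline) of the '。'-based split with
+# de-duplication used above.
+def _demo_split_sentences():
+    article = "第一句。第二句。第一句。结尾无句号"
+    sentences, seen, _begin = [], set(), 0
+    for _iter in re.finditer("。", article):
+        _sen = article[_begin:_iter.span()[1]]
+        if len(_sen) > 0 and _sen not in seen:
+            sentences.append(_sen)
+            seen.add(_sen)
+        _begin = _iter.span()[1]
+    _sen = article[_begin:]
+    if len(_sen) > 0 and _sen not in seen:
+        sentences.append(_sen)
+    print(sentences)  # -> ['第一句。', '第二句。', '结尾无句号']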
+
+# Scratch test of merging adjacent entities ("ronghe" = merge) on a hard-coded example
+def ronghe():
+    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
+    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
+    s = ['person', 'org', 'company', 'union']
+    remove_num = 0
+    for i in range(len(ners) - 1):  # stop before the last entity: ners[i + 1] is accessed
+        ner = ners[i]
+        begin = ner[0]
+        end = ner[1]
+        ner_type = ner[2]  # renamed from `type` to avoid shadowing the builtin
+
+        if ner_type in s:
+            if end == ners[i + 1][0] and a[end - 1] == '、':
+                print(1)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + '、' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+                continue
+            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
+                print(2)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + ',' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+
+    for i in range(remove_num):
+        ners.remove(0)
+    print(ners)
+
+if __name__ == '__main__':
+    # get_data1()
+    # get_ners()
+    # test02()
+    # get_unionNers()
+    # complainant / complained-about & punished parties
+    # get_complainant()
+    # ronghe()
+    # classification
+    # textClassify()
+    # whether the complaint is upheld, decision text (complaints)
+    # get_punishWhether01()
+    # decision text (punishments)
+    # get_punishDecision()
+    # enforcement institution, punishment time
+    get_institution()
+    # punishment type
+    # get_punishType()
+
+    pass