import sys
import os
sys.path.append(os.path.abspath("../.."))
import time  # time.time() is used throughout; make the dependency explicit
import re
import pandas as pd
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
from BiddingKG.dl.interface.predictor import *
from BiddingKG.dl.foolnltk import selffool
from BiddingKG.dl.interface.Preprocessing import *
def get_data1():
    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
    load = pd.concat([load1, load2, load3], axis=0)
    load = load.reset_index(drop=True)
    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
    sentences_list = get_sentences1(load['PAGE_CONTENT'])
    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
def get_ners():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # data = data.head(3)
    nersList = []
    for index, _sentences in zip(data.index, data['sentences']):
        _sentences = _sentences.split('*#*>')
        _ners = getNers(_sentences, useselffool=True)
        word_index = 0
        for ners, sentence in zip(_ners, _sentences):
            if len(ners) != 0:
                word_ner_list = ['O'] * len(sentence)
                for ner in ners:
                    nerDict = dict()
                    entity_type = ner[2]
                    nerDict['entity_type'] = entity_type
                    entity_text = ner[3]
                    nerDict['entity_text'] = entity_text
                    begin_index = ner[0]
                    nerDict['begin_index'] = begin_index
                    end_index = ner[1] - 1
                    nerDict['end_index'] = end_index
                    wordOffset_begin = word_index + begin_index
                    nerDict['wordOffset_begin'] = wordOffset_begin
                    wordOffset_end = wordOffset_begin + len(entity_text)
                    nerDict['wordOffset_end'] = wordOffset_end
                    nerDict['sentence'] = sentence
                    nerDict['article_index'] = index
                    # print('====')
                    # print(begin_index, end_index, entity_type, entity_text)
                    nersList.append(nerDict)
                    # print(nerDict)
                    word_ner_list[begin_index] = 'B'
                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
            word_index += len(sentence)
    # save(nersList, "nersList.pk")
# Merge adjacent (org/company) and (person) entities
def get_unionNers():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    ners = load("nersList.pk")
    org_companys = [[] for _ in range(len(data))]
    type1 = ['org', 'company', 'union_oc']
    persons = [[] for _ in range(len(data))]
    type2 = ['person', 'union_person']
    for ner in ners:
        if ner['entity_type'] in type1:
            org_companys[ner['article_index']].append(ner)
        if ner['entity_type'] in type2:
            persons[ner['article_index']].append(ner)
    # merge org and company entities
    new_org_companys = []
    for org_company in org_companys:
        if org_company and len(org_company) > 1:
            union_nums = 0
            for i in range(len(org_company) - 1):
                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 \
                        and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    # print(1)
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    # print(org_company[i + 1]['entity_text'])
                    org_company[i] = 0
                    union_nums += 1
                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] \
                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
                    # print(2)
                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
                    # print(org_company[i + 1]['entity_text'])
                    org_company[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                org_company.remove(0)
        new_org_companys.append(org_company)
    # merge person entities
    new_persons = []
    for person in persons:
        if person and len(person) > 1:
            union_nums = 0
            for i in range(len(person) - 1):
                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 \
                        and person[i]['sentence'][person[i]['end_index']] == '、' \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    # print(1)
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    # print(person[i + 1]['entity_text'])
                    person[i] = 0
                    union_nums += 1
                elif person[i]['end_index'] == person[i + 1]['begin_index'] \
                        and person[i]['sentence'] == person[i + 1]['sentence']:
                    # print(2)
                    person[i + 1]['begin_index'] = person[i]['begin_index']
                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
                    # print(person[i + 1]['entity_text'])
                    person[i] = 0
                    union_nums += 1
            for _ in range(union_nums):
                person.remove(0)
        new_persons.append(person)
    # save([new_org_companys, new_persons], "unionNers.pk")
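
# Illustrative sketch (not part of the original pipeline): the merge rule above
# combines two entities when they sit in the same sentence and are either
# separated by a single '、' or directly adjacent. Minimal standalone example
# with hypothetical dicts shaped like the entries stored in nersList.pk:
def _demo_union_merge():
    sentence = "甲公司、乙公司"
    a = {'entity_text': '甲公司', 'begin_index': 0, 'end_index': 3, 'sentence': sentence}
    b = {'entity_text': '乙公司', 'begin_index': 4, 'end_index': 7, 'sentence': sentence}
    # same condition as the '、'-separated branch above
    if a['end_index'] == b['begin_index'] - 1 and sentence[a['end_index']] == '、':
        b['begin_index'] = a['begin_index']
        b['entity_text'] = a['entity_text'] + '+' + b['entity_text']
    print(b['entity_text'])  # 甲公司+乙公司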
def test02():
    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]")
    # need_index = []
    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
    #     a = 0
    #     if text_rule.search(text):
    #         a = 1
    #     if title_rule.search(title):
    #         a = 0
    #     if text_rule.search(title):
    #         a = 1
    #     if a:
    #         need_index.append(index)
    # print(len(need_index))
    # load = load.loc[need_index]
    # print(len(load))
    # load = load.reset_index(drop=True)
    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
    complainants_list = []
    a = 1
    load = load[9744:9745]
    size = 16
    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
        print(a)
        a += 1
        getSentences = sentences.split('*#*>')
        # print(getSentences)
        ners = getNers(getSentences, useselffool=True)
        print(ners)
        print('======================')
        word_index = 0
        ners_list = []
        complainants = []  # collected per article so complainants_list aligns with load's rows
        for ner, sentence in zip(ners, getSentences):
            if len(ner) != 0:
                for aner in ner:
                    entity_type = aner[2]
                    entity_text = aner[3]
                    # begin = word_index + aner[0]
                    # end = begin + len(entity_text)
                    # complainant
                    if entity_type in ['org', 'company', 'person']:
                        left = sentence[max(0, aner[0] - size):aner[0]]
                        print(entity_text, left, sentence)
                        if complaint_rule.search(left):
                            print('yes')
                            entity_type = 'complainant'
                            complainants.append(entity_text)
                        # ners_list.append([begin, end, entity_type, entity_text])
            word_index += len(sentence)
        complainants_list.append(complainants)
    # test
    # for i in ners_list:
    #     print(i[3])
    #     print(processed[0][i[0]:i[1]])
    load['complainant'] = complainants_list
    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
# Complainant, complained-against party, punished party
def get_complainant():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
    # ners = load("nersList.pk")
    unionNers = load("unionNers.pk")
    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
    complainants = [[] for _ in range(len(data))]
    punishPeople = [[] for _ in range(len(data))]
    a = ['org', 'company', 'person']
    size = 16
    # complainant / challenger
    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished party / complained-against party
    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    time1 = time.time()
    for _ner in ners:
        if _ner:
            for ner in _ner:
                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
                # print(left)
                if complainants_rule1.search(left):
                    complainants[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule1.search(left):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
                    if data['类别'][ner['article_index']] == '投诉处理':
                        complainants[ner['article_index']].append(ner['entity_text'])
                    else:
                        punishPeople[ner['article_index']].append(ner['entity_text'])
                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
                    punishPeople[ner['article_index']].append(ner['entity_text'])
    data['complainant'] = complainants
    data['punishPeople'] = punishPeople
    print(time.time() - time1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
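
# Toy check of the left-context rule used above: a window of `size` characters
# before the entity is matched against the complainant pattern. The pattern here
# is a simplified subpattern of complainants_rule1, for illustration only:
def _demo_left_context():
    sentence = "投诉人:某某建设有限公司,联系地址:某市某区"
    begin = 4  # assumed begin_index of the company entity
    left = sentence[max(0, begin - 16):begin]
    print(bool(re.search("投[诉拆][人方][::]+", left)))  # True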
def get_complainant2(list_sentences, list_entitys, text_type):
    '''
    :param list_sentences: list_sentences from get_preprocessed()
    :param list_entitys: list_entitys from get_preprocessed()
    :param text_type: article category (penalty type)
    :return:
        complainants: list of complainants
        punishPeople: complained-against / punished parties
    '''
    sentences_list = list_sentences
    entitys_list = list_entitys
    size = 16
    a = ['org', 'company', 'person']
    b = ['org', 'company', 'union_org_company']
    c = ['person', 'union_person']
    need_entitys = []
    for entity in entitys_list:
        if entity.entity_type in a:
            need_entitys.append(entity)
    # entity merging
    drop_count = 0
    for i in range(1, len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        sentence = sentences_list[entity.sentence_index].sentence_text
        last_entity = need_entitys[i - 1]
        if entity.sentence_index == last_entity.sentence_index:
            if (entity.entity_type in b and last_entity.entity_type in b) or \
                    (entity.entity_type in c and last_entity.entity_type in c):
                if entity_begin - last_entity.wordOffset_end < 2 and \
                        sentence[last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
                    need_entitys[i].begin_index = last_entity.begin_index
                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
                    if entity.entity_type in b:
                        need_entitys[i].entity_type = 'union_org_company'
                    else:
                        need_entitys[i].entity_type = 'union_person'
                    need_entitys[i - 1] = 0
                    drop_count += 1
    for _ in range(drop_count):
        need_entitys.remove(0)
    # complainant / challenger
    complainants_rule1 = re.compile(
        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    # punished party / complained-against party
    punishPeople_rule1 = re.compile(
        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
    punishPeople_rule2_1 = re.compile(",$")
    punishPeople_rule2_2 = re.compile("^[::]")
    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
    complainants = []
    punishPeople = []
    for i in range(len(need_entitys)):
        entity = need_entitys[i]
        entity_begin = entity.wordOffset_begin
        entity_end = entity.wordOffset_end
        # sentence containing the entity
        sentence = sentences_list[entity.sentence_index].sentence_text
        left = sentence[max(0, entity_begin - size):entity_begin]
        right = sentence[entity_end:min(entity_end + size, len(sentence))]
        if complainants_rule1.search(left):
            complainants.append(entity)
        elif punishPeople_rule1.search(left):
            punishPeople.append(entity)
        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
            if text_type == '投诉处理':
                complainants.append(entity)
            else:
                punishPeople.append(entity)
        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
            punishPeople.append(entity)
    result_complainants = []
    result_punishPeople = []
    for entity in complainants:
        if entity.entity_type in ['union_org_company', 'union_person']:
            for item in entity.entity_text.split('+'):
                result_complainants.append(item)
        else:
            result_complainants.append(entity.entity_text)
    for entity in punishPeople:
        if entity.entity_type in ['union_org_company', 'union_person']:
            for item in entity.entity_text.split('+'):
                result_punishPeople.append(item)
        else:
            result_punishPeople.append(entity.entity_text)
    return list(set(result_complainants)), list(set(result_punishPeople))
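
# Hedged usage sketch for get_complainant2. The docstring says its inputs come
# from BiddingKG's get_preprocessed(); the exact call and unpacking below are
# assumptions, not confirmed by this file:
#
#   list_articles, list_sentences, list_entitys = get_preprocessed(...)  # hypothetical unpacking
#   complainants, punishPeople = get_complainant2(
#       list_sentences[0], list_entitys[0], text_type='投诉处理')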
# Announcement classification
def textClassify():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # complainant | informant | whistle-blower | challenger | challenge letter
    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
    re1 = re.compile(patten1)
    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
    re2 = re.compile(patten2)
    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
    re3 = re.compile(patten3)
    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
    re4 = re.compile(patten4)
    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
    re5 = re.compile(patten5)
    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
    re6 = re.compile(patten6)
    patten7 = '处理决定'
    re7 = re.compile(patten7)
    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
    re8 = re.compile(patten8)
    pos = []
    _type = []
    for title, text in zip(data['PAGE_TITLE'], data["PAGE_CONTENT"]):
        p = []
        t = ''
        if re1.search(text) or re1.search(title):
            p.append(patten1)
            t = '投诉'
        elif re2.search(text) and re.search('投诉', text):
            p.append('投诉+' + patten2)
            t = '投诉'
        elif re.search("回复", title):
            p.append("回复")
            t = '投诉'
        if len(p) == 0:
            if re3.search(title) or re3.search(text):
                p.append(patten3)
                t = '处罚'
            elif re4.search(title):
                p.append(patten4)
                t = '处罚'
            elif re5.search(title) or re5.search(text):
                p.append(patten5)
                t = '处罚'
            elif re6.search(text) or re6.search(title):
                p.append(patten6)
                t = '处罚'
            elif re8.search(text):
                p.append(patten8)
                t = '处罚'
        if len(p) == 0:
            if re7.search(text) and re.search('投诉', text):
                p.append('投诉+' + patten7)
                t = '投诉'
            elif re7.search(text) or re7.search(title):
                p.append("处罚+" + patten7)
                t = '处罚'
        pos.append(p)
        _type.append(t)
    data['pos'] = pos
    data['type'] = _type
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
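
# Minimal sketch of the rule priority in textClassify(): complaint patterns
# (re1/re2/"回复") are tried first, and penalty patterns fire only when no
# complaint rule matched. Toy input with simplified subpatterns:
def _demo_textClassify_priority():
    text = "投诉人:某公司。经监督检查,投诉成立。"
    if re.search("投诉人|检举人|举报人|质疑人", text):
        print('投诉')  # complaint wins even though 监督检查 also matches
    elif re.search("行政处罚|监督检查", text):
        print('处罚')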
# Whether the complaint is upheld
def get_punishWhether01():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '投诉']
    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
    punishWhether = []
    punishDecision = []
    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|(?:裁决|处理)依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|(?:裁决|处理)依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")

    def findDecision(text):
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.findall(text)
            decision = decision[-1]
        elif punishDecision_5.search(text):
            decision = punishDecision_5.search(text).group(1)
        elif punishDecision_6.search(text):
            decision = punishDecision_6.findall(text)
            decision1 = decision[-1]
            if re.search("诉讼", decision1) and len(decision) > 1:
                decision1 = decision[-2]
            decision = decision1
        return decision

    for text in data['PAGE_CONTENT']:
        pw = ''
        if punishWhether_1.search(text):
            pw = 1
        elif punishWhether_0.search(text):
            pw = 0
        punishWhether.append(pw)
        # prefer a decision found in the second half of the article
        mid = len(text) // 2
        lower_half = text[mid:]
        decision = findDecision(lower_half)
        if decision == '':
            decision = findDecision(text)
        punishDecision.append(decision)
    data['punishWhether'] = punishWhether
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
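
# Small self-check for the punishWhether patterns above, using simplified
# subpatterns and made-up inputs (illustrative only):
def _demo_punishWhether():
    pw_1 = re.compile("投诉[^。,,不]+?成立|投诉成立|情况属实|予以支持")
    pw_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|不予支持|予以驳回")
    print(bool(pw_1.search("经查,投诉事项成立。")))      # True -> upheld
    print(bool(pw_0.search("投诉事项不成立,予以驳回。")))  # True -> rejected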
# Penalty decision
def get_punishDecision():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
    data = data[data['type'] == '处罚']
    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
    punishDecision = []
    for text in data['PAGE_CONTENT']:
        decision = ''
        if punishDecision_1.search(text):
            decision = punishDecision_1.search(text).group(1)
        elif punishDecision_2.search(text):
            decision = punishDecision_2.search(text).group(1)
        elif punishDecision_3.search(text):
            decision = punishDecision_3.search(text).group(1)
        elif punishDecision_4.search(text):
            decision = punishDecision_4.search(text).group(1)
        elif punishDecision_5.search(text):
            decision = punishDecision_5.findall(text)
            decision = decision[-1]
        elif punishDecision_6.search(text):
            decision = punishDecision_6.search(text).group(1)
        elif punishDecision_7.search(text):
            decision = punishDecision_7.findall(text)
            decision = decision[-1]
        punishDecision.append(decision)
    data['punishDecision'] = punishDecision
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
# Enforcement institution and penalty time
def get_institution():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
    ners = load("nersList.pk")
    orgs = [[] for _ in range(len(data))]
    times = [[] for _ in range(len(data))]
    institutions = [[] for _ in range(len(data))]
    punishTimes = [[] for _ in range(len(data))]
    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
    for ner in ners:
        if ner['entity_type'] == 'org':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if institution_1.search(left):
                institutions[ner['article_index']].append(ner['entity_text'])
            orgs[ner['article_index']].append(ner)
        elif ner['entity_type'] == 'time':
            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
            if punishTimes_1.search(left):
                punishTimes[ner['article_index']].append(ner['entity_text'])
            times[ner['article_index']].append(ner)
    # keep only the last few candidates per article
    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
    times = [_time[-3:] if len(_time) > 3 else _time for _time in times]
    data['org'] = orgs
    data['time'] = times
    data['institution'] = institutions
    data['punishTime'] = punishTimes
    # data = data[data['type'].isin(["投诉","处罚"])]
    print(len(data))
    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
    institution_list = []
    punishTime_list = []
    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    for title, text, org, n_time, institution, punishTime in zip(data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'], data['time'], data['institution'], data['punishTime']):
        ins = ''
        ptime = ''
        if punishTime:
            ptime = punishTime
        if institution:
            ins = institution
        else:
            title_ners = getNers([title], useselffool=True)
            if title_ners[0]:
                for title_ner in title_ners[0]:
                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
                        # 'title:'+
                        ins = title_ner[3]
                        # print(title_ner[3])
                        break
        # if ins == '':
        for _org in org[::-1]:
            right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
            if institution_time.search(right):
                if ins == '':
                    # "text_EndWithTime:" +
                    ins = _org['entity_text']
                if ptime == '':
                    # "text_EndWithIns:" +
                    ptime = institution_time.search(right).group(1)
                break
        if ptime == '' and len(n_time) != 0:
            textLong = len(text)
            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
                # "EndOfText:" +
                ptime = n_time[-1]['entity_text']
        institution_list.append(ins)
        punishTime_list.append(ptime)
    data['institution'] = institution_list
    data['punishTime'] = punishTime_list
    data = data.drop(columns=['org', 'time'], axis=1)
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
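
# Illustrative check of the right-context rule above: an org entity immediately
# followed by a date is taken as the issuing institution, and that date as the
# penalty time. Made-up right-context string:
def _demo_institution_time():
    rule = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
    right = ",2019年5月8日"
    m = rule.search(right)
    print(m.group(1) if m else None)  # ,2019年5月8日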
# Penalty type
def get_punishType():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
    # tentative categories: 严重违法失信, 行政处罚, 投诉处理, 监督检查, 其他失信记录
    # unrelated announcements
    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
    othertype = "其他无关公告"
    # complaint handling
    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
    re1_2 = re.compile("处理决定|回复")
    re1_type = '投诉处理'
    # supervision and inspection
    re2 = re.compile("监督检查|监管调查|监督处理")
    re2_type = "监督检查"
    # administrative penalty
    re3 = re.compile("行政处罚|行政处理")
    re3_type = "行政处罚"
    # serious dishonesty / illegal conduct
    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
    re4_type = "严重违法失信"
    # other dishonesty announcements
    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
                          "违[规法]行为|违规事项|考评依据|失信行为")
    re_otherType = "其他失信公告"
    punishType_list = []
    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
        punishType = ''
        titleWithText = title + text
        if title_rule.search(title):
            punishType = othertype
        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
            punishType = re1_type
        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
            punishType = re1_type
        elif re2.search(titleWithText):
            punishType = re2_type
        elif re3.search(titleWithText):
            punishType = re3_type
        elif re4.search(titleWithText):
            punishType = re4_type
        elif re_other.search(titleWithText) or re.search("处罚", title):
            punishType = re_otherType
        punishType_list.append(punishType)
    data['punishType'] = punishType_list
    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')
def getNers_my(sentences, MAXAREA=10000, useselffool=False):
    '''
    :param sentences: list of sentences
    :return: entity recognition results, produced batch by batch so that each
             selffool call processes at most roughly MAXAREA characters
    '''
    def getData(ners, process_data):
        process_sentences = [item[1] for item in process_data]
        print(process_data)
        if useselffool:
            ner_ = selffool.self_ner(process_sentences)
        else:
            ner_ = selffool.ner(process_sentences)
        print('ner_ :', ner_)
        for i in range(len(ner_)):
            the_index = process_data[i][0]
            ners[the_index] = ner_[i]

    sents = []
    for i in range(len(sentences)):
        sents.append([i, sentences[i]])
    # process the longest sentences first
    sents.sort(key=lambda x: len(x[1]), reverse=True)
    print(sents)
    index_ = 0
    ners = [[] for i in range(len(sentences))]
    while True:
        width = len(sents[index_][1])
        height = MAXAREA // width + 1
        if height > len(sents) - index_:
            height = len(sents) - index_
        process_data = sents[index_:index_ + height]
        getData(ners, process_data)
        index_ += height
        if index_ >= len(sents):
            break
    return ners
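
# Hedged usage sketch: getNers_my sorts sentences longest-first and batches them
# so each selffool call covers at most roughly MAXAREA characters; results are
# written back in the original sentence order. Illustrative call (assumes
# BiddingKG's selffool model is importable and loaded):
#
#   ners = getNers_my(["第一句。", "第二个句子。"], MAXAREA=5000, useselffool=True)
#   # ners[i] is the entity list for sentences[i]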
# Web announcement processing
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: article source html to process
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles
# Sentence splitting
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article text
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # NLP processing
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # rate-limited execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences, useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
        if len(list_sentences_temp) == 0:
            # fall back to the whole article when no sentence was produced
            list_sentences_temp.append(article_processed)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences
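
# Standalone sketch of the sentence-splitting rule used above: split on '。'
# and drop exact duplicate sentences (a set tracks what was already emitted).
def _demo_split_dedup():
    text = "第一句。第二句。第一句。结尾"
    seen, out, begin = set(), [], 0
    for m in re.finditer("。", text):
        sen = text[begin:m.span()[1]]
        if sen and sen not in seen:
            out.append(sen)
            seen.add(sen)
        begin = m.span()[1]
    tail = text[begin:]
    if tail and tail not in seen:
        out.append(tail)
    print(out)  # ['第一句。', '第二句。', '结尾']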
def ronghe():
    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
    s = ['person', 'org', 'company', 'union']
    remove_num = 0
    # stop at len(ners) - 1: each step looks ahead at ners[i + 1]
    for i in range(len(ners) - 1):
        print(0)
        ner = ners[i]
        begin = ner[0]
        end = ner[1]
        ner_type = ner[2]
        if ner_type in s:
            if end == ners[i + 1][0] and a[end - 1] == '、':
                print(1)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + '、' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
                continue
            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
                print(2)
                new_begin = begin
                new_end = ners[i + 1][1]
                new_type = 'union'
                new_text = ner[3] + ',' + ners[i + 1][3]
                new_ner = (new_begin, new_end, new_type, new_text)
                ners[i] = 0
                ners[i + 1] = new_ner
                remove_num += 1
    for i in range(remove_num):
        ners.remove(0)
    print(ners)
if __name__ == '__main__':
    # get_data1()
    # get_ners()
    # test02()
    # get_unionNers()
    # complainant / complained-against / punished party
    # get_complainant()
    # ronghe()
    # classification
    # textClassify()
    # whether the complaint is upheld, decision (complaints)
    # get_punishWhether01()
    # decision (penalties)
    # get_punishDecision()
    # enforcement institution, penalty time
    get_institution()
    # penalty type
    # get_punishType()
    pass