# punish_predictor.py
  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
  3. # @Author : bidikeji
  4. # @Time : 2021/1/25 0025 16:35
  5. #!/usr/bin/python3
  6. # -*- coding: utf-8 -*-
  7. # @Author : bidikeji
  8. # @Time : 2020/12/24 0024 15:23
  9. import re
  10. import os
  11. import time
  12. import tensorflow as tf
  13. from BiddingKG.dl.common.Utils import *
  14. from BiddingKG.dl.common.nerUtils import *
  15. from keras.preprocessing.sequence import pad_sequences
  16. def decode(logits, trans, sequence_lengths, tag_num):
  17. viterbi_sequences = []
  18. for logit, length in zip(logits, sequence_lengths):
  19. score = logit[:length]
  20. viterbi_seq, viterbi_score = viterbi_decode(score, trans)
  21. viterbi_sequences.append(viterbi_seq)
  22. return viterbi_sequences
  23. class Punish_Extract():
  24. def __init__(self, model_file = os.path.dirname(__file__)+"/models/punish_code.pb"):
  25. print('model_file_path:',model_file)
  26. self.sess = tf.Session(graph=tf.Graph())
  27. self.code = ""
  28. self.punish_dicition = ""
  29. self.model_file = model_file #预测编号模型
  30. self.load_model()
  31. # 加载处罚编号预测模型
  32. def load_model(self):
  33. log("get model of time")
  34. with self.sess.as_default():
  35. with self.sess.graph.as_default():
  36. output_graph_def = tf.GraphDef()
  37. with open(self.model_file, 'rb') as f:
  38. output_graph_def.ParseFromString(f.read())
  39. tf.import_graph_def(output_graph_def, name="")
  40. self.sess.run(tf.global_variables_initializer())
  41. self.char_input = self.sess.graph.get_tensor_by_name("char_input:0")
  42. self.length = self.sess.graph.get_tensor_by_name("length:0")
  43. self.trans = self.sess.graph.get_tensor_by_name("crf_loss/transitons:0")
  44. self.logits = self.sess.graph.get_tensor_by_name("CRF/output/logits:0")
  45. # 处罚编号预测
  46. def predict_punishCode(self,list_sentences, MAX_AREA=5000):
  47. '''
  48. 每个句子预测处罚编号
  49. :param list_sentences: 多篇文章句子列表[[每篇文章句子列表]]
  50. :param MAX_AREA: 控制最大每个句子长度,超过截断
  51. :return: 处罚编号字符串,若有多个;号隔开
  52. '''
  53. re_ner = re.compile("12+?3")
  54. article_ner_list = []
  55. count = 0
  56. with self.sess.as_default():
  57. with self.sess.graph.as_default():
  58. for sentences in list_sentences:
  59. count += 1
  60. # print(count)
  61. sentences.sort(key=lambda x: len(x.sentence_text), reverse=True)
  62. _begin_index = 0
  63. while True:
  64. MAX_LEN = len(sentences[_begin_index].sentence_text)
  65. if MAX_LEN > MAX_AREA:
  66. MAX_LEN = MAX_AREA
  67. _LEN = MAX_AREA // MAX_LEN
  68. # sentence_len = [len(sentence.sentence_text) for sentence in sentences[_begin_index:_begin_index+_LEN]]
  69. sentence_len = [len(sentence.sentence_text) if len(sentence.sentence_text)<=MAX_LEN else MAX_LEN for sentence in sentences[_begin_index:_begin_index+_LEN]]
  70. sentences_x = []
  71. for sentence in sentences[_begin_index:_begin_index+_LEN]:
  72. sentence = sentence.sentence_text
  73. sentence = list(sentence)
  74. sentence2id = [getIndexOfWord(word) for word in sentence]
  75. sentences_x.append(sentence2id)
  76. sentences_x = pad_sequences(sentences_x, maxlen=MAX_LEN, padding="post", truncating="post")
  77. sentences_x = [np.array(x) for x in sentences_x]
  78. _logits, _trans = self.sess.run([self.logits, self.trans],
  79. feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
  80. viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
  81. ner_list = []
  82. for _seq, sentence in zip(viterbi_sequence, sentences[_begin_index:_begin_index+_LEN]):
  83. sentence = sentence.sentence_text
  84. seq_id = ''.join([str(s) for s in _seq])
  85. if re_ner.search(seq_id):
  86. # print("sentence: ",sentence)
  87. for _ner in re_ner.finditer(seq_id):
  88. start = _ner.start()
  89. end = _ner.end()
  90. n = sentence[start:end]
  91. # print(n,'<==>',start,end)
  92. # ner_list.append((n, start, end))
  93. ner_list.append(n) # 改为只返回实体字符
  94. # article_ner_list.append(ner_list)
  95. article_ner_list.append(';'.join(set(ner_list)))
  96. if _begin_index+_LEN >= len(sentences):
  97. break
  98. _begin_index += _LEN
  99. return article_ner_list[0]
  100. # 处罚类型
  101. def get_punishType(self, x1, x2):
  102. '''通过文章标题及内容判断文章类别
  103. x1: 标题
  104. x2: 内容
  105. return 类别'''
  106. # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
  107. # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
  108. '''标题正则'''
  109. # 未知公告
  110. unknow = re.compile('采购方式|采购公告|采购招标|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
  111. '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
  112. '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
  113. '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发') #|结果公示 部分是
  114. # 投诉处理
  115. tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
  116. # 行政处罚
  117. xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
  118. # 监督检查
  119. jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
  120. # 严重违法
  121. yzwf = re.compile('严重违法失信|黑名单|失信名单')
  122. # 不良行为
  123. blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
  124. '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
  125. '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
  126. '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
  127. '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
  128. '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
  129. # 其他不良行为
  130. other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
  131. '|举报处理|结果无效|成交无效|行政复议')
  132. '''正文内容正则'''
  133. # 投诉处理
  134. tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
  135. '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
  136. # 行政处罚
  137. xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
  138. # 诚信加分
  139. cxjf_c = re.compile('处罚结果.*诚信加分')
  140. # 严重违法失信
  141. yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
  142. # 不良行为
  143. blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
  144. '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
  145. '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
  146. '|暂停|封禁|暂无|行政处罚)|处罚结果'
  147. '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
  148. '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
  149. '(不规范|不良|不诚信)行为记录')
  150. # 其他不良行为
  151. other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
  152. if re.search(unknow, x1):
  153. return re.search(unknow, x1).group(0), '未知类别'
  154. elif re.search(yzwf, x1):
  155. return re.search(yzwf, x1).group(0), '严重违法'
  156. elif re.search(yzwf_c, x2):
  157. return re.search(yzwf_c, x2).group(0), '严重违法'
  158. elif re.search(tscl, x1):
  159. return re.search(tscl, x1).group(0), '投诉处理'
  160. elif re.search(xzcf, x1):
  161. return re.search(xzcf, x1).group(0), '行政处罚'
  162. elif re.search(jdjc, x1):
  163. return re.search(jdjc, x1).group(0), '监督检查'
  164. elif re.search(blxw, x1):
  165. return re.search(blxw, x1).group(0), '不良行为'
  166. elif re.search(other, x1):
  167. return re.search(other, x1).group(0), '其他不良行为'
  168. elif re.search(tscl_c, x2):
  169. return re.search(tscl_c, x2).group(0), '投诉处理'
  170. elif re.search(xzcf_c, x2):
  171. return re.search(xzcf_c, x2).group(0), '行政处罚'
  172. elif re.search(cxjf_c, x2):
  173. return re.search(cxjf_c, x2).group(0), '诚信加分'
  174. elif re.search(blxw_c, x2):
  175. return re.search(blxw_c, x2).group(0), '不良行为'
  176. elif re.search(other_c, x2):
  177. return re.search(other_c, x2).group(0), '其他不良行为'
  178. return ' ', '未知类别'
  179. # 处罚决定
  180. def get_punishDecision(self, x, x2):
  181. '''通过正则匹配文章内容中的处理决定
  182. x:正文内容
  183. x2: 处罚类别
  184. return 处理决定字符串'''
  185. rule1 = re.compile(
  186. '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
  187. '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
  188. '|整改意见)[::].{5,}')
  189. rule2 = re.compile(
  190. '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
  191. '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
  192. '|处罚内容)[:,,].{10,}')
  193. rule3 = re.compile('考评结果:?.*')
  194. rule4 = re.compile('(依据|根据)《.*》.*')
  195. if x2 == '未知类别':
  196. return ' '
  197. elif re.search(rule1, x[-int(len(x)*0.4):]):
  198. return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
  199. elif re.search(rule1, x[-int(len(x)*0.6):]):
  200. return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
  201. elif re.search(rule2, x[-int(len(x)*0.7):]):
  202. return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
  203. elif re.search(rule3, x[-int(len(x)*0.6):]):
  204. return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
  205. elif re.search(rule4, x[-int(len(x)*0.4):]):
  206. return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
  207. else:
  208. return ' '
  209. # 投诉是否成立
  210. def get_punishWhether(self, x1, x2, x3):
  211. '''通过正则匹配处理决定判断投诉是否成立
  212. x1: 处理决定字符串
  213. x2: 正文内容
  214. x3: 处罚类别
  215. return 投诉是否成立'''
  216. p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
  217. '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
  218. '|采购活动违法|(中标|评标|成交)结果无效')
  219. p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
  220. '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
  221. '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
  222. '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
  223. if x3 != '投诉处理':
  224. return ' '
  225. elif re.search(p1, x1):
  226. return '投诉成立'
  227. elif re.search(p2, x1):
  228. return '投诉无效'
  229. elif re.search(p1, x2):
  230. return '投诉成立'
  231. elif re.search(p2, x2):
  232. return '投诉无效'
  233. return ' '
  234. # 执法机构、处罚时间
  235. def get_institution(self, title, sentences_l, entity_l):
  236. '''
  237. 通过判断实体前信息判断改实体是否为执法机构
  238. :param title: 文章标题
  239. :param sentences_l: 单篇公告句子列表
  240. :param entity_l: 单篇公告实体列表
  241. :return: 执法机构及处罚时间字符串,多个的用;号隔开
  242. '''
  243. institutions = []
  244. punishTimes = []
  245. institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
  246. punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
  247. # 通过实体前面关键词判断是否为执法机构或处罚时间
  248. for ner in entity_l:
  249. if ner.entity_type == 'org':
  250. left = sentences_l[ner.sentence_index].sentence_text[
  251. max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
  252. if institution_1.search(left):
  253. institutions.append(ner)
  254. elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
  255. ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
  256. sentences_l[ner.sentence_index].sentence_text[
  257. ner.wordOffset_begin:institutions[-1].wordOffset_end] \
  258. in ['', '、', '和', '及']:
  259. institutions.append(ner)
  260. elif ner.entity_type == 'time':
  261. left = sentences_l[ner.sentence_index].sentence_text[
  262. max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
  263. if punishTimes_1.search(left):
  264. punishTimes.append(ner)
  265. institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
  266. institution_time = re.compile(
  267. "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
  268. ins = ""
  269. ptime = ""
  270. # 如果前面步骤找不到处罚机构则在标题找实体,并正则检查是否有关键词
  271. if institutions == [] and len(title)>10:
  272. title_ners = getNers([title], useselffool=True)
  273. if title_ners[0]:
  274. for title_ner in title_ners[0]:
  275. if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
  276. ins = title_ner[3]
  277. break
  278. if punishTimes == [] or institutions == []:
  279. # 如果前面步骤还没找到要素,则通过公司实体后面是否有日期关键词,有则作为处罚机构和处罚时间
  280. for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
  281. right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
  282. if institution_time.search(right):
  283. if ins == '':
  284. ins = ner.entity_text
  285. if ptime == '':
  286. ptime = institution_time.search(right).group(1)
  287. break
  288. # 前面步骤都没找到则判断最后一个时间实体是否在文章末尾,是则作为处罚时间
  289. if ptime == '':
  290. n_time = [ner for ner in entity_l if ner.entity_type == 'time']
  291. if len(n_time) != 0:
  292. ner = n_time[-1]
  293. if ner.sentence_index == len(sentences_l) - 1:
  294. textLong = len(sentences_l[ner.sentence_index].sentence_text)
  295. if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
  296. ptime = ner.entity_text
  297. institutions = [ner.entity_text for ner in institutions]
  298. punishTimes = [ner.entity_text for ner in punishTimes]
  299. if institutions == [] and ins != "":
  300. institutions.append(ins)
  301. if punishTimes == [] and ptime != "":
  302. punishTimes.append(ptime)
  303. return ";".join(institutions), ";".join(punishTimes)
  304. # 投诉人、被投诉人、被处罚人
  305. def get_complainant(self, punishType, sentences_l, entity_l):
  306. '''
  307. 通过对公告类别、句子列表、实体列表正则寻找投诉人、被投诉人、处罚人
  308. :param punishType: 公告处罚类别
  309. :param sentences_l: 单篇公告句子列表
  310. :param entity_l: 单篇公告实体列表
  311. :return: 投诉人、被投诉人
  312. '''
  313. complainants = [] # 投诉人
  314. punishPeople = [] # 被投诉人、被处罚人
  315. size = 16
  316. # 投诉人、质疑人
  317. complainants_rule1 = re.compile(
  318. "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
  319. # 被处罚人,被投诉人
  320. punishPeople_rule1 = re.compile(
  321. "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
  322. punishPeople_rule2_1 = re.compile(",$")
  323. punishPeople_rule2_2 = re.compile("^[::]")
  324. punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
  325. punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
  326. punish_l = [] # 处罚实体列表
  327. tmp = []
  328. for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
  329. if tmp == []:
  330. tmp.append(ner)
  331. elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
  332. ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
  333. and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
  334. '',
  335. '、',
  336. '和',
  337. '及']:
  338. tmp.append(ner)
  339. elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
  340. ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
  341. and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
  342. '',
  343. '、',
  344. '和',
  345. '及']:
  346. tmp.append(ner)
  347. else:
  348. punish_l.append(tmp)
  349. tmp = [ner]
  350. for ner_l in punish_l:
  351. begin_index = ner_l[0].wordOffset_begin
  352. end_index = ner_l[-1].wordOffset_end
  353. left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
  354. right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
  355. if complainants_rule1.search(left):
  356. complainants.append(ner_l)
  357. elif punishPeople_rule1.search(left):
  358. punishPeople.append(ner_l)
  359. elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
  360. if punishType == '投诉处理':
  361. complainants.append(ner_l)
  362. else:
  363. punishPeople.append(ner_l)
  364. elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
  365. punishPeople.append(ner_l)
  366. complainants = set([it.entity_text for l in complainants for it in l])
  367. punishPeople = set([it.entity_text for l in punishPeople for it in l])
  368. return ';'.join(complainants), ';'.join(punishPeople)
  369. def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
  370. list_result = []
  371. for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
  372. title = article.title
  373. text=article.content
  374. keyword, punishType = self.get_punishType(title, text)
  375. if punishType == "未知类别":
  376. punishType = ""
  377. # print('处罚类型:',punishType)
  378. punish_code = self.predict_punishCode(list_sentences)
  379. # print('处罚编号: ',punish_code)
  380. institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
  381. # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
  382. punishDecision = self.get_punishDecision(text, punishType)
  383. # print('处罚决定:',punishDecision)
  384. punishWhether= self.get_punishWhether(punishDecision, text, punishType)
  385. # print('投诉是否成立:',punishWhether)
  386. complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
  387. # print('投诉人:%s 被投诉人:%s'%(complainants, punishPeople))
  388. punish_dic = {'punish_code':punish_code,
  389. 'punishType':punishType,
  390. 'punishDecision':punishDecision,
  391. 'complainants':complainants,
  392. 'punishPeople':punishPeople,
  393. 'punishWhether':punishWhether,
  394. 'institutions':institutions,
  395. 'punishTimes':punishTimes}
  396. return {k: v for k, v in punish_dic.items() if v not in ['', ' ']}
if __name__ == "__main__":
    # Manual smoke run: constructing the extractor loads the punish-code model.
    punish = Punish_Extract()
    import pandas as pd
    # NOTE: the rest of this script used to be commented-out experiments and
    # is summarised here instead of being kept as dead code:
    #   * a batch run that read titles/contents from a local Excel sheet,
    #     called the extractors row by row, wrote the extracted fields
    #     (complainant, punished party, institution, punishment time,
    #     punishment code) back to Excel and printed the elapsed time;
    #   * a single-text check that split a sample procurement announcement on
    #     '。' and passed it to punish.predict_punishCode.
    # Those snippets called predict(...) and get_punish_extracts(text=...),
    # neither of which matches a definition visible in this file, so they
    # would need to be rewritten against Punish_Extract before being revived.