Преглед изворног кода

保留单个的空格,以解决预处理中时间被分割的问题;联系人去重的 MaxCompute 代码

rogel пре 4 година
родитељ
комит
96e719b167

+ 4 - 1
BiddingKG/dl/common/Utils.py

@@ -134,7 +134,10 @@ def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
         while(_begin<len_sample):
             new_dict = dict()
             for _key in feed_dict.keys():
-                new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
+                if isinstance(feed_dict[_key],(float,int,np.int32,np.float_,np.float16,np.float32,np.float64)):
+                    new_dict[_key] = feed_dict[_key]
+                else:
+                    new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
             _output = sess.run(list_output,feed_dict=new_dict)
             for _index in range(len(list_output)):
                 list_result[_index].extend(_output[_index])

+ 34 - 29
BiddingKG/dl/complaint/punish_rule.py

@@ -143,7 +143,7 @@ class Punish_Extract():
                                 # ner_list.append((n, start, end))
                                 ner_list.append(n)  # 改为只返回实体字符
                     # article_ner_list.append(ner_list)
-                    article_ner_list.append(''.join(set(ner_list)))
+                    article_ner_list.append(';'.join(set(ner_list)))
         return article_ner_list[0]
 
     # 处罚类型
@@ -261,7 +261,7 @@ class Punish_Extract():
         elif re.search(rule4, x[-int(len(x)*0.4):]):
             return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
         else:
-            return ' '
+            return ''
 
     # 投诉是否成立
     def get_punishWhether(self, x1, x2, x3):
@@ -278,7 +278,7 @@ class Punish_Extract():
                         '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
                         '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
         if x3 != '投诉处理':
-            return ' '
+            return ''
         elif re.search(p1, x1):
             return '投诉成立'
         elif re.search(p2, x1):
@@ -287,7 +287,7 @@ class Punish_Extract():
             return '投诉成立'
         elif re.search(p2, x2):
             return '投诉无效'
-        return ' '
+        return ''
 
     # 执法机构、处罚时间
     def get_institution(self, title, sentences_l, entity_l):
@@ -296,7 +296,7 @@ class Punish_Extract():
         :param title: 文章标题
         :param sentences_l: 单篇公告句子列表
         :param entity_l: 单篇公告实体列表
-        :return: 执法机构及处罚时间字符串,多个的用号隔开
+        :return: 执法机构及处罚时间字符串,多个的用;号隔开
         '''
         institutions = []
         punishTimes = []
@@ -359,7 +359,7 @@ class Punish_Extract():
             institutions.append(ins)
         if punishTimes == [] and ptime != "":
             punishTimes.append(ptime)
-        return ";".join(institutions), ";".join(punishTimes)
+        return ";".join(institutions), ";".join(punishTimes)
 
     # 投诉人、被投诉人、被处罚人
     def get_complainant(self, punishType, sentences_l, entity_l):
@@ -426,7 +426,7 @@ class Punish_Extract():
                 punishPeople.append(ner_l)
         complainants = set([it.entity_text for l in complainants for it in l])
         punishPeople = set([it.entity_text for l in punishPeople for it in l])
-        return ';'.join(complainants), ';'.join(punishPeople)
+        return ';'.join(complainants), ';'.join(punishPeople)
 
     def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
         list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
@@ -459,30 +459,35 @@ class Punish_Extract():
         for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
             title = article.title
             text=article.content
+
             keyword, punishType = self.get_punishType(title, text)
-            if punishType == "未知类别":
-                list_result.append({"punish":{}})
-            else:
-                # print('处罚类型:',punishType)
-                punish_code = self.predict_punishCode(list_sentences)
-                # print('处罚编号: ',punish_code)
-                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
-                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-                punishDecision = self.get_punishDecision(text, punishType)
-                # print('处罚决定:',punishDecision)
-                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-                # print('投诉是否成立:',punishWhether)
-                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
-                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-                punish_dic = {'punish_code':punish_code,
-                              'punishType':punishType,
-                              'punishDecision':punishDecision,
-                             'complainants':complainants,
-                             'punishPeople':punishPeople,
-                             'punishWhether':punishWhether,
-                             'institutions':institutions,
-                             'punishTimes':punishTimes}
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
                 list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
         return list_result
 
 if __name__ == "__main__":

+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -223,7 +223,7 @@ class CodeNamePredict():
             list_entitys = [[] for _ in range(len(list_sentences))]
         for list_sentence,list_entity in zip(list_sentences,list_entitys):
             if len(list_sentence)==0:
-                result.append([list_sentence[0].doc_id,{"code":[],"name":""}])
+                result.append([{"code":[],"name":""}])
                 continue
             doc_id = list_sentence[0].doc_id
             # sentences = []

+ 12 - 12
BiddingKG/dl/test/test4.py

@@ -114,7 +114,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\admin\\Desktop\\新建文本文档 (2).txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -134,18 +134,18 @@ if __name__=="__main__":
     # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
-    text = '''
-    ,清远市清新区治理道路货物运输车辆非法超限超载工作领导小组清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同
-    验收报告,一、合同编号:GDMALL2019123563,。二、合同名称:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同。
-    三、中标、成交供应商:广州爱联科技有限公司,地址:广州市黄埔大道西468号勤建商务大厦14层。联系人:周勇联系电话:020-85180120,。
-    四、合同金额(元):¥3,270.00,。五、合同详细信息:。采购项目编号::441827-201910-531001-0013,中标/成交标的名称::喷墨打印机,
-    数量::1台。采购项目名称::喷墨打印机,规格型号::WF-7218,中标/成交金额(元)::3,270.00。服务要求::,。,。六、验收结论:已通过。
-    七、验收小组成员名单::。八、联系事项:。(一)采购人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组,地址:太和镇玄真路49号。
-    联系人:苏美彩,联系电话:0763-5835988,。(二)采购代理机构:地址::。联系人:联系电话::。附件::。
-    发布人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组。发布时间:2019年11月26日
-    '''
+    # text = '''
+    # ,清远市清新区治理道路货物运输车辆非法超限超载工作领导小组清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同
+    # 验收报告,一、合同编号:GDMALL2019123563,。二、合同名称:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同。
+    # 三、中标、成交供应商:广州爱联科技有限公司,地址:广州市黄埔大道西468号勤建商务大厦14层。联系人:周勇联系电话:020-85180120,。
+    # 四、合同金额(元):¥3,270.00,。五、合同详细信息:。采购项目编号::441827-201910-531001-0013,中标/成交标的名称::喷墨打印机,
+    # 数量::1台。采购项目名称::喷墨打印机,规格型号::WF-7218,中标/成交金额(元)::3,270.00。服务要求::,。,。六、验收结论:已通过。
+    # 七、验收小组成员名单::。八、联系事项:。(一)采购人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组,地址:太和镇玄真路49号。
+    # 联系人:苏美彩,联系电话:0763-5835988,。(二)采购代理机构:地址::。联系人:联系电话::。附件::。
+    # 发布人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组。发布时间:2019年11月26日
+    # '''
     print("start")
-    print(predict("12",text,"重庆市綦江区人民法院关于重庆市綦江区文龙街道沙溪路22号银海新城六期45号楼、46号楼、47号楼负一层213号车位(第一次拍卖)的公告"))
+    print(predict("12",content))
     # print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)