Przeglądaj źródła

解决上下文无法识别为招标人却判断为招标人的问题

rogel 4 lat temu
rodzic
commit
e56022842c

+ 19 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -419,10 +419,10 @@ def tableToText(soup):
                 inner_table[_h][_w][1] = 0
 
 
-        # print("=====")
-        # for item in inner_table:
-        #     print(item)
-        # print("======")
+        print("=====")
+        for item in inner_table:
+            print(item)
+        print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -639,6 +639,11 @@ def tableToText(soup):
             head_end = head_list[head_i+1]
                 
             direct = getDirect(inner_table, head_begin, head_end)
+
+            print("----")
+            print(inner_table[head_begin:head_end])
+            print("head_end-head_begin",head_end-head_begin)
+            print(direct)
             
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
@@ -657,7 +662,16 @@ def tableToText(soup):
                     text_line = text_line+"。" if text_line!="" else text_line
                 text += text_line
             else:
-        
+                #构建一个共现矩阵
+                table_occurence = []
+                for i in range(head_begin,head_end):
+                    line_oc = []
+                    for j in range(width):
+                        cell = inner_table[i][j]
+                        line_oc.append({"text":cell[0],"type":cell[1]})
+                    table_occurence.append(line_oc)
+
+
                 if direct=="row":
                     for i in range(head_begin,head_end):
                         pack_text = ""

+ 1 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -834,7 +834,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         if str(entity_before.label)=="1":
                             addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
                             #add pointer_money
-                            entity_before.pointer_money = entity_after
+                            entity_before.pointer_money = entity_money
                         break
                     p_entity_money -= 1
 

+ 2 - 2
BiddingKG/dl/interface/predictor.py

@@ -833,9 +833,9 @@ class FormPredictor():
         else:
             return self.getModel(type).predict(form_datas)
     
-    
+
 #角色规则
-#依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率    
+#依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
 class RoleRulePredictor():
     
     def __init__(self):

+ 3 - 3
BiddingKG/dl/test/test4.py

@@ -68,8 +68,8 @@ class MyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 
-def predict(doc_id,text):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
+def predict(doc_id,text,title=""):
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
     for articles in list_articles:
         print(articles.content)
 
@@ -138,7 +138,7 @@ if __name__=="__main__":
     # ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
     # '''
     print("start")
-    print(predict("12",content))
+    print(predict("12",content,"重庆市綦江区人民法院关于重庆市綦江区文龙街道沙溪路22号银海新城六期45号楼、46号楼、47号楼负一层213号车位(第一次拍卖)的公告"))
     # print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)

+ 111 - 33
BiddingKG/maxcompute/documentDumplicate.py

@@ -374,6 +374,22 @@ class decare_document(BaseUDTF):
                                     new_json_set_docid.append(_item2)
                             self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
 
+def getBestDocid(list_pair):
+    list_pair.sort(key=lambda x:x[3],reverse=True)
+    _max_count = max(list_pair[0][3],list_pair[0][1])
+    set_candidate = set()
+    if list_pair[0][1]==_max_count:
+        set_candidate.add(list_pair[0][0])
+    for item in list_pair:
+        if item[3]==_max_count:
+            set_candidate.add(item[2])
+        else:
+            break
+    list_candidate = list(set_candidate)
+    list_candidate.sort(key=lambda x:x)
+    return list_candidate[0]
+
+
 @annotate('bigint,bigint,bigint,bigint->string')
 class choose_document(BaseUDAF):
     '''
@@ -395,32 +411,15 @@ class choose_document(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(str(item[2]))
-        #不包含这条公告
-        # _set.add(list_pair[0][0])
-        if list_pair[0][1]>_max_count:
+        list_dumplicate = list(_set)
+        best_docid = getBestDocid(list_pair)
+        if best_docid==list_pair[0][0]:
             save_flag = 1
-            # _set.remove(list_pair[0][0])
-            list_dumplicate = list(_set)
         else:
-            if list_pair[0][1]<_max_count:
-                save_flag = 0
-            else:
-                less_docid = list_pair[0][0]
-                for item in list_pair:
-                    if item[3]>=_max_count and item[2]<less_docid:
-                        less_docid = item[2]
-                if less_docid==list_pair[0][0]:
-                    save_flag = 1
-                else:
-                    save_flag = 0
-            list_dumplicate = list(_set)
+            save_flag = 0
         return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
 
 
@@ -464,22 +463,11 @@ class group_document_bestFirst(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(item[2])
         _set.add(list_pair[0][0])
-        best_docid = None
-        if list_pair[0][1]>_max_count:
-            best_docid = list_pair[0][0]
-        else:
-            best_docid = list_pair[0][2]
-            for item in list_pair:
-                if item[3]>=_max_count and item[2]<best_docid:
-                    best_docid = item[2]
+        best_docid = getBestDocid(list_pair)
         _set.remove(best_docid)
         list_dumplicate = list(_set)
         list_dumplicate.sort(key=lambda x:x)
@@ -616,3 +604,93 @@ class get_count_dump(object):
             _count = len(title.split(","))
         return _count
 
+def getSet(list_dict,key):
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^\d[\d\.]*$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+@annotate('bigint,string -> bigint,bigint')
+class f_getGroup_dumpFinal(BaseUDTF):
+    '''
+    从最后的结果中获取组
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,docid,dumplicates):
+        self.forward(int(docid),int(docid))
+        if dumplicates is not None:
+            list_docids = dumplicates.split(",")
+            for _docid in list_docids:
+                self.forward(int(docid),int(_docid))
+
+@annotate('bigint,bigint,string,string,string,string,bigint,bigint->string')
+class f_redump_limit_num(BaseUDAF):
+    '''
+    去重合并后重新判断,组内个数大于5时,doctitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
+    组内个数小于等于5时,tenderee、win_tenderer、bidding_budget组内只能有一个取值
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,main_docid,docid,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2):
+        buffer[0].append({"main_docid":main_docid,"docid":docid,"set_limit_column1":set_limit_column1,"set_limit_column2":set_limit_column2,
+                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,"extract_count2":extract_count2})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+        if len(the_group)>5:
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        else:
+            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
+        stay = True
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                stay = False
+                break
+        final_group = []
+        if stay:
+            main_docid = the_group[0]["main_docid"]
+            for item in the_group:
+                if item["docid"]!=main_docid:
+                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
+
+        return json.dumps(final_group)
+
+@annotate('string -> bigint,bigint,bigint,bigint')
+class f_get_dumpFinal_checked(BaseUDTF):
+    '''
+    从最后的结果中获取组
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group):
+        if list_group is not None:
+            final_group = json.loads(list_group)
+            for _group in final_group:
+                self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"])