@@ -374,6 +374,22 @@ class decare_document(BaseUDTF):
 
                         new_json_set_docid.append(_item2)
                 self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
 
+def getBestDocid(list_pair):
+    # pairs look like (docid1, extract_count1, docid2, extract_count2)
+    list_pair.sort(key=lambda x:x[3],reverse=True)
+    _max_count = max(list_pair[0][3],list_pair[0][1])
+    set_candidate = set()
+    if list_pair[0][1]==_max_count:
+        set_candidate.add(list_pair[0][0])
+    for item in list_pair:
+        if item[3]==_max_count:
+            set_candidate.add(item[2])
+        else:
+            break
+    # among the docids tied at the highest extract_count, the smallest id wins
+    list_candidate = list(set_candidate)
+    list_candidate.sort(key=lambda x:x)
+    return list_candidate[0]
+
+
 @annotate('bigint,bigint,bigint,bigint->string')
 class choose_document(BaseUDAF):
     '''
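
For reference, a minimal sketch of the tie-breaking rule in getBestDocid; the pair layout
(docid1, extract_count1, docid2, extract_count2) is inferred from the callers, and the ids
below are made up:

    # docid1 (105) has extract_count 3; partners 207 and 102 are tied at the
    # max count 5, so both become candidates and the smallest docid wins
    list_pair = [(105, 3, 207, 5), (105, 3, 102, 5), (105, 3, 301, 2)]
    assert getBestDocid(list_pair) == 102
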
@@ -395,32 +411,15 @@ class choose_document(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(str(item[2]))
-        # do not include this announcement itself
-        # _set.add(list_pair[0][0])
-        if list_pair[0][1]>_max_count:
+        list_dumplicate = list(_set)
+        best_docid = getBestDocid(list_pair)
+        if best_docid==list_pair[0][0]:
             save_flag = 1
-            # _set.remove(list_pair[0][0])
-            list_dumplicate = list(_set)
         else:
-            if list_pair[0][1]<_max_count:
-                save_flag = 0
-            else:
-                less_docid = list_pair[0][0]
-                for item in list_pair:
-                    if item[3]>=_max_count and item[2]<less_docid:
-                        less_docid = item[2]
-                if less_docid==list_pair[0][0]:
-                    save_flag = 1
-                else:
-                    save_flag = 0
-            list_dumplicate = list(_set)
+            save_flag = 0
         return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
@@ -464,22 +463,11 @@ class group_document_bestFirst(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(item[2])
         _set.add(list_pair[0][0])
-        best_docid = None
-        if list_pair[0][1]>_max_count:
-            best_docid = list_pair[0][0]
-        else:
-            best_docid = list_pair[0][2]
-            for item in list_pair:
-                if item[3]>=_max_count and item[2]<best_docid:
-                    best_docid = item[2]
+        best_docid = getBestDocid(list_pair)
         _set.remove(best_docid)
         list_dumplicate = list(_set)
         list_dumplicate.sort(key=lambda x:x)
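
Unlike choose_document, group_document_bestFirst keeps the group head in the candidate
set and then drops whichever docid getBestDocid picks; with hypothetical ids:

    # docids in the group: {102, 207, 301}; getBestDocid picks 102,
    # so terminate reports the sorted remainder [207, 301] as duplicates
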
@@ -616,3 +604,93 @@ class get_count_dump(object):
 
         _count = len(title.split(","))
         return _count
 
+def getSet(list_dict,key):
+    # distinct values of `key` across a list of dicts; numeric strings are
+    # normalized through float() so "100" and "100.0" count as one value
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search(r"^\d[\d\.]*$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+@annotate('bigint,string -> bigint,bigint')
+class f_getGroup_dumpFinal(BaseUDTF):
+    '''
+    get the groups from the final result
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,docid,dumplicates):
+        # a document always belongs to its own group
+        self.forward(int(docid),int(docid))
+        if dumplicates is not None:
+            list_docids = dumplicates.split(",")
+            for _docid in list_docids:
+                self.forward(int(docid),int(_docid))
+
+@annotate('bigint,bigint,string,string,string,string,bigint,bigint->string')
+class f_redump_limit_num(BaseUDAF):
+    '''
+    re-check groups after dedup merging: when a group has more than 5 members,
+    doctitle, tenderee, win_tenderer and bidding_budget may each take only one
+    value inside the group; with 5 or fewer members, only tenderee, win_tenderer
+    and bidding_budget are restricted to a single value
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,main_docid,docid,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2):
+        buffer[0].append({"main_docid":main_docid,"docid":docid,"set_limit_column1":set_limit_column1,"set_limit_column2":set_limit_column2,
+                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,"extract_count2":extract_count2})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        the_group = buffer[0]
+        # larger groups must additionally agree on the title column
+        if len(the_group)>5:
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        else:
+            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
+        stay = True
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                stay = False
+                break
+        final_group = []
+        if stay:
+            main_docid = the_group[0]["main_docid"]
+            for item in the_group:
+                if item["docid"]!=main_docid:
+                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
+        return json.dumps(final_group)
+
+@annotate('string -> bigint,bigint,bigint,bigint')
+class f_get_dumpFinal_checked(BaseUDTF):
+    '''
+    get the groups from the final result
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group):
+        if list_group is not None:
+            final_group = json.loads(list_group)
+            for _group in final_group:
+                self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"])
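
A short sketch of how getSet feeds the one-value-per-column rule in f_redump_limit_num
(the dicts and values below are made up for illustration):

    group = [{"set_limit_column3": "100"}, {"set_limit_column3": "100.0"}]
    # both entries normalize to "100.0", so the column still counts as one value
    assert getSet(group, "set_limit_column3") == {"100.0"}

Thanks to that normalization, "100" and "100.0" in a budget column do not split a group,
while any genuinely different value makes terminate return an empty list, dropping the group.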
|