Ver Fonte

包号错误的问题

rogel há 4 anos
pai
commit
f44d20d5fa

+ 8 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -530,11 +530,11 @@ def getPackagesFromArticle(list_sentence,list_entity):
     PackageSet = set()
     dict_packageCode = dict()
     
-    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
-    package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
-    package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
+    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]*([^,。]{3,30})")
+    package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2}")
+    package_number_pattern = re.compile("(([^承]*(包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
     # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
-    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{,50}?)(,|。)')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
+    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]*([^,。]{3,50})')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
     win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{,25})(,|。)')  # 2020/11/23 大网站规则 调整
     model_pattern = re.compile('(型号|序号)[::]([^,。]{,20})(,|。)')  # 2020/11/23 大网站规则 调整
     number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
@@ -594,7 +594,9 @@ def getPackagesFromArticle(list_sentence,list_entity):
             names = re.findall(other_package_pattern, content)
         N_names = re.findall(package_N_name_pattern,content)
         if len(names)==1 and len(N_names)==1:
+            print("=====",names,N_names)
             package_names.append([names[0][-1],N_names[0][-1]])
+    print("=====",package_names)
     for i in range(len(list_sentence)):
         PackageList_item = []
         PackageList_item_scope = []
@@ -627,6 +629,7 @@ def getPackagesFromArticle(list_sentence,list_entity):
         PackageList_scope = PackageList_scope+PackageList_item_scope
         PackageList_item.sort(key=lambda x:x["sentence_index"])
         #PackageList = PackageList+PackageList_item
+    print("=====",PackageList_scope)
     #不作为包
     # if len(PackageSet)==0:
     #     for i in range(len(list_sentence)):
@@ -683,6 +686,7 @@ def getPackagesFromArticle(list_sentence,list_entity):
             PackageList_scope = PackageList_scope+PackageList_item_scope
             PackageList_item.sort(key=lambda x:x["sentence_index"])
 
+
     pattern_punctuation = "[::()\(\),,。;;]"
     for i in range(len(list_sentence)):
         for j in range(len(PackageList_scope)):

+ 3 - 3
BiddingKG/dl/interface/modelFactory.py

@@ -170,7 +170,7 @@ class Model_person_classify():
       if self.model_person is None:
         with self.sess_person.as_default() as sess:
           with sess.graph.as_default():
-            meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/person_savedmodel")
+            meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/person_savedmodel_backup")
             signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
             signature_def = meta_graph_def.signature_def
             
@@ -195,8 +195,8 @@ class Model_person_classify():
     '''
     
     def encode(self,tokens,begin_index,end_index,**kwargs):
-        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
-        return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
+        return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
+        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
 
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2,3))

+ 8 - 8
BiddingKG/dl/test/test4.py

@@ -113,9 +113,9 @@ def test(name,content):
 
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
-    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
-    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
     # import pandas as pd
@@ -134,12 +134,12 @@ if __name__=="__main__":
     # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
-    text = '''
-    ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
-    '''
+    # text = '''
+    # ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
+    # '''
     print("start")
-    # print(predict("12",content))
-    print(predict("投诉处理公告", text))
+    print(predict("12",content))
+    # print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)
     pass

+ 13 - 8
BiddingKG/maxcompute/documentDumplicate.py

@@ -213,7 +213,8 @@ class f_set_docid(BaseUDAF):
             _set_tenderee = set()
             _group = []
             for j in range(_begin,len(list_docs)):
-                _set_tenderee.add(list_docs[j]["tenderee"])
+                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
+                    _set_tenderee.add(list_docs[j]["tenderee"])
                 _set_column.add(list_docs[j]["defind_column"])
                 _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
             if len(_group)>=3 and len(_set_tenderee)>1:
@@ -408,14 +409,18 @@ class choose_document(BaseUDAF):
             # _set.remove(list_pair[0][0])
             list_dumplicate = list(_set)
         else:
-            save_flag = 0
-            less_docid = list_pair[0][2]
-            for item in list_pair:
-                if item[3]>=_max_count and item[2]<less_docid:
-                    less_docid = item[2]
-            _set.remove(str(less_docid))
+            if list_pair[0][1]<_max_count:
+                save_flag = 0
+            else:
+                less_docid = list_pair[0][0]
+                for item in list_pair:
+                    if item[3]>=_max_count and item[2]<less_docid:
+                        less_docid = item[2]
+                if less_docid==list_pair[0][0]:
+                    save_flag = 1
+                else:
+                    save_flag = 0
             list_dumplicate = list(_set)
-            list_dumplicate.insert(0,str(less_docid))
         return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})