@@ -350,6 +350,224 @@ class f_remege_limit_num_contain(BaseUDAF):
        log(str(final_group))
        return json.dumps(final_group)

+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
+    _time = time.strftime(format,time.localtime())
+    return _time
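+# getCurrent_date above returns the current local time as a string such as
+# "2024-05-01 09:30:00"; it assumes `time` is imported at module level elsewhere
+# in this file (the import is not part of this hunk).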
+
+@annotate('bigint->string')
+class f_get_single_merged_bychannel(BaseUDTF):
+
+    def process(self,docid):
+        _d = {"data":{str(docid):[]},"process_time":getCurrent_date()}
+        self.forward(json.dumps(_d))
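+    # Illustrative output for docid=123 (timestamp made up):
+    #   {"data": {"123": []}, "process_time": "2024-05-01 09:30:00"}
+    # i.e. a single-document group whose channel map contains only the document itself.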
+
+
+
+
+@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint->string')
+class f_remege_limit_num_contain_bychannel(BaseUDAF):
+    '''
+    project number, winning bidder, len(project number)>7, winning bidder <> "",
+    fewer than 2 distinct non-empty tendering units after merging, and identical
+    non-empty amounts for the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
+        _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                 "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                 "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence}
+        _count = 0
+        for _,v in _dict.items():
+            if v is not None and str(v)!="":
+                _count += 1
+        _dict["extract_count"] = _count
+        buffer[0].append(_dict)
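+    # iterate() buffers each input row as a plain dict, e.g. (values illustrative):
+    #   {"docid": 101, "docchannel": 52, "page_time_stamp": 1714521600, ...,
+    #    "confidence": 1, "extract_count": 9}
+    # extract_count counts the non-empty input fields and is later used to rank
+    # duplicate documents within a channel.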
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def getNotLikeSet(self,_dict,column_name):
+        column_value = _dict.get(column_name,None)
+        _set = set()
+        if column_value is not None:
+            for _i in range(1,len(column_value)):
+                _set.add(column_value[_i-1:_i+1])
+        _dict["notLike_set"] = _set
+
+    def getSimilarity(self,_set1,_set2):
+        _sum = max([1,min([len(_set1),len(_set2)])])
+        return len(_set1&_set2)/_sum
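+    # Illustrative sketch of the character-bigram similarity used in terminate()
+    # below (not part of the UDAF interface): getNotLikeSet turns "ABCD" into the
+    # bigram set {"AB", "BC", "CD"}; getSimilarity({"AB","BC","CD"}, {"AB","BC","CE"})
+    # is 2 / min(3, 3) ≈ 0.67, which lies between SIM_PROB (0.6) and 1, so the two
+    # values count as "similar but not identical".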
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+
+        SIM_PROB = 0.6
+        for _d in the_group:
+            self.getNotLikeSet(_d,"notLike_column")
+
+        #check whether any limit column carries more than one distinct value
+        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        re_merge = False
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                re_merge = True
+                break
+        #check whether any pair of notLike values is similar but not identical
+        re_merge_sim = False
+        for _i1 in range(0,len(the_group)):
+            for _j1 in range(_i1+1,len(the_group)):
+                _set1 = the_group[_i1]["notLike_set"]
+                _set2 = the_group[_j1]["notLike_set"]
+                _sim = self.getSimilarity(_set1,_set2)
+                if _sim>SIM_PROB and _sim<1:
+                    re_merge_sim = True
+                    break
+        contain_keys = ["contain_column1","contain_column2"]
+
+        logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
+        #regroup
+        dict_docid_doc = {}
+        for _doc in the_group:
+            dict_docid_doc[_doc["docid"]] = _doc
+        if re_merge or re_merge_sim:
+            the_group.sort(key=lambda x:x["confidence"],reverse=True)
+            the_group.sort(key=lambda x:x["page_time_stamp"])
+
+            for _doc in the_group:
+                merge_flag = False
+                for _index in range(len(list_group)):
+                    _g = list_group[_index]
+                    hit_count = 0
+                    dict_temp = dict()
+                    #the multiple-value anomaly: only merge when one contain value contains the other
+                    if re_merge:
+                        for _c_key in contain_keys:
+                            dict_temp[_c_key] = _g[_c_key]
+                            if _g[_c_key] is not None and _doc[_c_key] is not None:
+                                if len(_g[_c_key])>len(_doc[_c_key]):
+                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
+                                        dict_temp[_c_key] = _g[_c_key]
+                                        hit_count += 1
+                                else:
+                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
+                                        dict_temp[_c_key] = _doc[_c_key]
+                                        _g[_c_key] = _doc[_c_key]
+                                        hit_count += 1
+                    else:
+                        hit_count = 1
+                    # if hit_count==len(contain_keys):
+                    if hit_count>0:
+                        _flag_sim = False
+                        #the similar-but-not-identical anomaly
+                        if re_merge_sim:
+                            for _docid in _g["docid"]:
+                                tmp_d = dict_docid_doc[_docid]
+                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
+                                if _sim>SIM_PROB and _sim<1:
+                                    _flag_sim = True
+                        if not _flag_sim:
+                            for _c_key in dict_temp.keys():
+                                _g[_c_key] = dict_temp[_c_key]
+                            _g["docid"].append(_doc["docid"])
+                            merge_flag = True
+                            break
+                if not merge_flag:
+                    _dict = dict()
+                    _dict["docid"] = [_doc["docid"]]
+                    for _c_key in contain_keys:
+                        _dict[_c_key] = _doc[_c_key]
+                    list_group.append(_dict)
+
+            final_group = []
+            #check whether each group keeps a single value per limit column
+            for _group in list_group:
+                _split = []
+                for _docid in _group["docid"]:
+                    _split.append(dict_docid_doc[_docid])
+
+                #sort by confidence so that as large a prefix of the group as possible is kept
+                _split.sort(key=lambda x:x["confidence"],reverse=True)
+                #for each limit column, find the index where its values start to diverge
+                list_key_index = []
+                for _k in keys:
+                    list_key_index.append(getDiffIndex(_split,_k))
+
+                _index = min(list_key_index)
+
+
+                final_group.append([_c["docid"] for _c in _split[:_index]])
+                for _c in _split[_index:]:
+                    final_group.append([_c["docid"]])
+
+
+            #previous behaviour (kept commented out): if more than one distinct value is
+            #found, every document becomes its own group, otherwise they stay as one group
+            # _flag = True
+            # for _key in keys:
+            #     if len(getSet(_split,_key))>1:
+            #         _flag = False
+            #         break
+            # if not _flag:
+            #     for _docid in _group["docid"]:
+            #         final_group.append([_docid])
+            # else:
+            #     final_group.append(list(set(_group["docid"])))
+        else:
+            final_group = [list(set([item["docid"] for item in the_group]))]
+        log(str(final_group))
+
+
+        #pick one announcement per channel
+        final_group_channel = []
+        for _group in final_group:
+            dict_channel_id = {}
+            otherChannel = 10000
+            for _docid in _group:
+                _channel = dict_docid_doc[_docid].get("docchannel")
+                if _channel in [115,116,117]:
+                    #each document in channels 115/116/117 gets its own pseudo-channel,
+                    #so these channels are never collapsed
+                    otherChannel += 1
+                    _channel = otherChannel
+                if _channel not in dict_channel_id:
+                    dict_channel_id[_channel] = []
+                dict_channel_id[_channel].append([_docid,dict_docid_doc[_docid].get("page_time_stamp"),dict_docid_doc[_docid].get("extract_count")])
+            channel_dict = {}
+            for k,v in dict_channel_id.items():
+                v.sort(key=lambda x:x[1])
+                v.sort(key=lambda x:x[2],reverse=True)
+                channel_dict[v[0][0]] = []
+                for _docs in v[1:]:
+                    channel_dict[v[0][0]].append(_docs[0])
+            _d = {"data":channel_dict,"process_time":getCurrent_date()}
+            final_group_channel.append(_d)
+
+
+        return json.dumps(final_group_channel)
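+    # Illustrative shape of the JSON returned by terminate() above (docids and
+    # timestamp made up):
+    #   [{"data": {"101": [102, 103], "205": []}, "process_time": "2024-05-01 09:30:00"},
+    #    {"data": {"301": []}, "process_time": "2024-05-01 09:30:00"}]
+    # each key of "data" is the docid kept for one channel (highest extract_count,
+    # earliest page_time_stamp on ties); its list holds the same-channel docids it replaces.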
+
+@annotate('string -> string')
+class f_get_remerge_group_channel(BaseUDTF):
+    '''
+    split the merge result into one record per group
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                self.forward(json.dumps(_group))
+
@annotate('string -> string')
class f_get_remerge_group(BaseUDTF):
    '''
@@ -840,7 +1058,29 @@ class f_getMergeProb(BaseUDTF):



+@annotate('string -> bigint,bigint')
+class f_check_remerge_channel(BaseUDTF):
+    '''
+    split each group into (main_docid, docid) records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                _keys = _group.get("data").keys()
+                if len(_keys)>0:
+                    main_docid = int(list(_keys)[0])
+                for k,v in _group.get("data",{}).items():
+                    self.forward(main_docid,int(k))
+                    for _v in v:
+                        self.forward(main_docid,int(_v))
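+    # Illustrative sketch: for a group {"data": {"101": [102, 103], "205": []}} the
+    # first key becomes main_docid (101) and the forwarded rows are
+    # (101, 101), (101, 102), (101, 103), (101, 205).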

@annotate('string -> bigint,bigint')
class f_check_remerge(BaseUDTF):