Просмотр исходного кода

统计规则,招标人召回

luojiehua 3 года назад
Родитель
Коммит
40c4f32ff5
2 изменённых файла: добавлено 114 и удалено 1
  1. 12 1
      BiddingKG/dl/metrics/extractMetric.py
  2. 102 0
      BiddingKG/maxcompute/filltenderee.py

+ 12 - 1
BiddingKG/dl/metrics/extractMetric.py

@@ -139,6 +139,8 @@ class ExtractMetric():
             _split = v.get("type").split("_")
             if v.get("type") in ["money_tendereeMoney"]:
                 _before_text = Htext[max(v["begin"]-10,0):v["begin"]]
+                if re.search('总投资|投资总额|总预算|总概算|投资规模|投资|工程造价', _before_text):
+                    continue
                 if re.search("万",_before_text) is not None and re.search("整",_before_text) is None:
                     _unit = 10000
                 else:
@@ -150,6 +152,8 @@ class ExtractMetric():
                 for _k,_v in dict_role.items():
                     if _v["subject"]==dict_anno[arg1]["text"]:
                         _before_text = Htext[max(dict_anno[arg2]["begin"]-10,0):dict_anno[arg2]["begin"]]
+                        if re.search('总投资|投资总额|总预算|总概算|投资规模|投资|工程造价', _before_text):
+                            continue
                         if re.search("万",_before_text) is not None and re.search("整",_before_text) is None:
                             _unit = 10000
                         else:
@@ -222,7 +226,7 @@ class ExtractMetric():
             _user = _payroll[2]
             doc_count = _payroll[3]
             print(_user,_begin_time,_end_time,doc_count)
-            _sql = "select document_id,value from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' limit 10)  order by document_id"%(_user,_begin_time,_end_time)
+            _sql = "select document_id,value from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' limit 100)  order by document_id"%(_user,_begin_time,_end_time)
             cursor.execute(_sql)
             rows = cursor.fetchall()
             if len(rows)>0:
@@ -341,6 +345,13 @@ class ExtractMetric():
                     _score = jaccard_score(dict_project.get(k),dict_project.get(k_other))
                     if _score>0.9:
                         _dict["%s_union"%base_key] = 1
+                else:
+                    #通过规则召回的也算
+                    if dict_project.get("%s_%s"%(base_key,"inter")) is None and dict_project.get("%s_%s"%(base_key,"inter2")) is not None:
+                        _dict[k] = 1
+                        _dict[k_other] = 1
+                        _dict["%s_union"%base_key] = 1
+
             else:
                 _dict["%s_union"%base_key] = len(set(v)&set(dict_project.get(k_other,[])))
             set_k.add(base_key)

+ 102 - 0
BiddingKG/maxcompute/filltenderee.py

@@ -0,0 +1,102 @@
+#coding:UTF8
+
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+import re
+import time
+import json
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import math
+
+@annotate('string->string')
+class f_splitProduct(BaseUDTF):
+
+    def process(self,product):
+        if product is None:
+            return
+        for str_p in product.split(","):
+            self.forward(str_p)
+
+def getTimeStamp(str_time):
+    try:
+        if str_time is not None and re.search("\d{4}\-\d{2}\-\d{2}.*",str_time) is not None:
+            timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
+            timeStamp = int(time.mktime(timeArray))
+            return timeStamp
+        else:
+            return 0
+    except Exception as e:
+        return 0
+
+@annotate('string->string')
+class f_groupproduct(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, page_time):
+        timestamp = getTimeStamp(page_time)
+        if timestamp>0:
+            _set = set(buffer[0])
+            _set.add(timestamp)
+            _list = list(_set)
+            _list.sort(key=lambda x:x,reverse=True)
+            buffer[0] = _list[:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        _set = set(buffer[0])
+        _list = list(_set)
+        _list.sort(key=lambda x:x,reverse=True)
+        buffer[0] = _list[:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(buffer[0],ensure_ascii=False)
+
+@annotate('string->bigint')
+class f_isdistinct(BaseUDAF):
+
+    def new_buffer(self):
+        return [{}]
+
+    def iterate(self,buffer, tenderee):
+        if len(buffer[0].keys())>20:
+            return
+        _key = tenderee
+        if tenderee is None or tenderee=="":
+            _key = "None"
+        if _key not in buffer[0]:
+            buffer[0][_key] = 0
+        buffer[0][_key] += 1
+        _key = "whole"
+        if _key not in buffer[0]:
+            buffer[0][_key] = 0
+        buffer[0][_key] += 1
+
+
+    def merge(self, buffer, pbuffer):
+        for k,v in pbuffer[0].items():
+            if k in buffer[0]:
+                buffer[0][k] += v
+            else:
+                buffer[0][k] = v
+
+    def terminate(self, buffer):
+        _dict = buffer[0]
+        if len(_dict.keys())>20:
+            return 0
+        _whole = _dict.get("whole",-1)
+        list_v = []
+        _empty = _dict.get("None",-1)
+        for k,v in _dict.items():
+            if k=="None":
+                continue
+            list_v.append(v)
+        _max = max(list_v)
+        if (_max+_empty)/_whole>0.9 and _max/_whole>0.4:
+            return 1
+
+        return 0