Browse code

Product field update + annotation system overhaul

rogel 4 years ago
Parent
Commit
793ad3a456
65 changed files with 2006 additions and 407 deletions
  1. 0 0
      examples/__init__.py
  2. 0 0
      examples/coreline/__init__.py
  3. 0 0
      examples/coreline/annotation.conf
  4. 0 0
      examples/coreline/articles.csv
  5. 0 0
      examples/coreline/bin/2020-08-01-2020-08-31要素标注统计.xls
  6. 0 0
      examples/coreline/bin/None-2020-09-25要素标注统计.xls
  7. 0 0
      examples/coreline/bin/None-2020-10-31要素标注统计.xls
  8. 0 0
      examples/coreline/bin/None-2020-11-25要素标注统计.xls
  9. 0 0
      examples/coreline/bin/None-2020-12-25要素标注统计.xls
  10. 0 0
      examples/coreline/bin/__init__.py
  11. 0 0
      examples/coreline/bin/csv_to_iepy.py
  12. 0 0
      examples/coreline/bin/gazettes_loader.py
  13. 0 0
      examples/coreline/bin/iepy_rules_runner.py
  14. 0 0
      examples/coreline/bin/iepy_runner.py
  15. 0 0
      examples/coreline/bin/manage.py
  16. 0 0
      examples/coreline/bin/preprocess.py
  17. 0 0
      examples/coreline/bin/rules_verifier.py
  18. 41 4
      examples/coreline/bin/settlement.py
  19. 0 0
      examples/coreline/extractor_config.json
  20. 20 0
      examples/coreline/gunicorn_django.py
  21. 0 0
      examples/coreline/rules.py
  22. 7 0
      examples/coreline/settings.py
  23. 0 0
      examples/coreline/test.sqlite
  24. 7 0
      examples/coreline/wsgi.py
  25. 69 0
      examples/nginx.conf
  26. 5 0
      examples/start.md
  27. Binary
      examples/test/bin/分组_1.xls
  28. Binary
      examples/test/bin/分组_10.xls
  29. Binary
      examples/test/bin/分组_2.xls
  30. Binary
      examples/test/bin/分组_3.xls
  31. Binary
      examples/test/bin/分组_4.xls
  32. Binary
      examples/test/bin/分组_5.xls
  33. Binary
      examples/test/bin/分组_6.xls
  34. Binary
      examples/test/bin/分组_7.xls
  35. Binary
      examples/test/bin/分组_8.xls
  36. Binary
      examples/test/bin/分组_9.xls
  37. 13 8
      iepy/data/models.py
  38. Binary
      iepy/selfpreprocess/BiddingKG/dl/complaint/models/punish_code.pb
  39. 473 0
      iepy/selfpreprocess/BiddingKG/dl/complaint/punish_predictor.py
  40. 98 70
      iepy/selfpreprocess/BiddingKG/dl/complaint/punish_rule.py
  41. 244 215
      iepy/selfpreprocess/BiddingKG/dl/interface/Preprocessing.py
  42. 3 0
      iepy/selfpreprocess/BiddingKG/dl/interface/getAttributes.py
  43. 94 8
      iepy/selfpreprocess/BiddingKG/dl/interface/predictor.py
  44. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/product_savedmodel/product.pb
  45. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/saved_model.pb
  46. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001
  47. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.index
  48. 0 0
      iepy/selfpreprocess/BiddingKG/dl/product/__init__.py
  49. 155 0
      iepy/selfpreprocess/BiddingKG/dl/product/data_util.py
  50. 68 4
      iepy/selfpreprocess/self_preprocess.py
  51. 159 0
      iepy/selfpreprocess/test4.py
  52. 1 1
      iepy/webui/brat/models.py
  53. 83 14
      iepy/webui/brat/src/annotation.py
  54. 0 3
      iepy/webui/brat/src/config.py
  55. 3 1
      iepy/webui/brat/src/dispatch.py
  56. 96 22
      iepy/webui/brat/src/document.py
  57. 13 12
      iepy/webui/brat/src/search.py
  58. 9 0
      iepy/webui/brat/static/client/src/annotator_ui.js
  59. 51 0
      iepy/webui/brat/static/client/src/custom.js
  60. 1 0
      iepy/webui/brat/static/client/src/url_monitor.js
  61. 53 9
      iepy/webui/brat/static/client/src/visualizer.js
  62. 123 2
      iepy/webui/brat/static/client/src/visualizer_ui.js
  63. 1 1
      iepy/webui/brat/static/style-ui.css
  64. 114 31
      iepy/webui/brat/templates/brat/index.html
  65. 2 2
      iepy/webui/brat/views.py

+ 0 - 0
examples/__init__.py


+ 0 - 0
examples/test/__init__.py → examples/coreline/__init__.py


+ 0 - 0
examples/test/annotation.conf → examples/coreline/annotation.conf


+ 0 - 0
examples/test/articles.csv → examples/coreline/articles.csv


+ 0 - 0
examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls → examples/coreline/bin/2020-08-01-2020-08-31要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-09-25要素标注统计.xls → examples/coreline/bin/None-2020-09-25要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-10-31要素标注统计.xls → examples/coreline/bin/None-2020-10-31要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-11-25要素标注统计.xls → examples/coreline/bin/None-2020-11-25要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-12-25要素标注统计.xls → examples/coreline/bin/None-2020-12-25要素标注统计.xls


+ 0 - 0
examples/coreline/bin/__init__.py


+ 0 - 0
examples/test/bin/csv_to_iepy.py → examples/coreline/bin/csv_to_iepy.py


+ 0 - 0
examples/test/bin/gazettes_loader.py → examples/coreline/bin/gazettes_loader.py


+ 0 - 0
examples/test/bin/iepy_rules_runner.py → examples/coreline/bin/iepy_rules_runner.py


+ 0 - 0
examples/test/bin/iepy_runner.py → examples/coreline/bin/iepy_runner.py


+ 0 - 0
examples/test/bin/manage.py → examples/coreline/bin/manage.py


+ 0 - 0
examples/test/bin/preprocess.py → examples/coreline/bin/preprocess.py


+ 0 - 0
examples/test/bin/rules_verifier.py → examples/coreline/bin/rules_verifier.py


+ 41 - 4
examples/test/bin/settlement.py → examples/coreline/bin/settlement.py

@@ -243,9 +243,46 @@ class Settlement():
 if __name__=="__main__":
     settle = Settlement()
     # settle.makeMigrate("test","2020-08-01","2020-08-31")
-    settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
-    # settle.makePayrolls("2020-08-01","2020-08-31")
-    settle.exportPayroll(begin_time=None,end_time='2020-12-25')
+    # settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
+    # settle.exportPayroll(begin_time=None,end_time='2020-12-25')
     # settle.createUser_batch(batch_size=102)
     # settle.exportLabels()
-    # settle.filter()
+    # settle.filter()
+
+    from brat.models import BratAnnotation as brat_annotations
+    from iepy.webui.corpus.models import IEDocument
+    filename = '74800260'
+    pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label")
+    _label = ""
+    if len(pre_label)>0:
+        if pre_label[0]["pre_label"] is not None:
+            _label = pre_label[0]["pre_label"]
+    dict_T = dict()
+    dict_R = dict()
+    set_pre_label = set()
+    for _i in _label.split(";"):
+        if _i!="":
+            set_pre_label.add(_i)
+    set_label = set()
+    anns = brat_annotations.objects.filter(document_id='74800260').values("value")
+    for _str in anns:
+        _str = _str["value"].strip()
+        if _str != "":
+            if _str[0]=="T":
+                match = re.search(T_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+            if _str[0]=="R":
+                match = re.search(R_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+    for _T,_v in dict_T.items():
+        set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+    for _R,_v in dict_R.items():
+        set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+    union_set = set_pre_label&set_label
+    deleted = len(set_pre_label)-len(union_set)
+    added = len(set_label)-len(union_set)
+    print(deleted,added)

+ 0 - 0
examples/test/extractor_config.json → examples/coreline/extractor_config.json


+ 20 - 0
examples/coreline/gunicorn_django.py

@@ -0,0 +1,20 @@
+
+
+# gunicorn_config.py
+import logging
+import logging.handlers
+from logging.handlers import WatchedFileHandler
+import os
+import multiprocessing
+bind = '127.0.0.1:8001'      # bind address (IP and port)
+backlog = 512                # listen backlog
+# chdir = '/label/iepy-develop/examples/coreline/bin'  # working directory gunicorn should chdir into
+timeout = 30      # worker timeout
+worker_class = 'gevent' # use the gevent worker class; 'sync' (the default) is also available
+
+workers = multiprocessing.cpu_count() * 2 + 1    # number of worker processes
+threads = 2 # threads per worker
+loglevel = 'info' # log level; this applies to the error log, the access log level cannot be configured
+access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"'
+accesslog = "/label/iepy-develop/examples/coreline/log/gunicorn_access.log"      # access log file
+errorlog = "/label/iepy-develop/examples/coreline/log/gunicorn_error.log"        # error log file

+ 0 - 0
examples/test/rules.py → examples/coreline/rules.py


+ 7 - 0
examples/test/settings.py → examples/coreline/settings.py

@@ -15,6 +15,11 @@ SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
 DEBUG = True
 TEMPLATE_DEBUG = True
 
+STATIC_URL = '/static/'
+
+# BASE_DIR is the absolute path of the project
+STATIC_ROOT = os.path.join(BASE_DIR, 'static')
+
 # Database
 # https://docs.djangoproject.com/en/1.7/ref/settings/#databases
 # DATABASES = {
@@ -54,6 +59,7 @@ Disallow: /confidential/
 """,
     "annotation.conf":"""
 [spans]
+product
 code
 name
 money
@@ -158,6 +164,7 @@ time_bidclose | 截标时间
 moneysource | 资金来源
 bidway | 招标方式
 serviceTime | 服务期限
+product | 产品
 
 #Protein | Protein | Pro | P
 #Protein_binding | Protein binding | Binding | Bind

+ 0 - 0
examples/test/test.sqlite → examples/coreline/test.sqlite


+ 7 - 0
examples/coreline/wsgi.py

@@ -0,0 +1,7 @@
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "coreline.settings")
+
+application = get_wsgi_application()

+ 69 - 0
examples/nginx.conf

@@ -0,0 +1,69 @@
+
+#user  nobody;
+worker_processes  10;
+
+#error_log  logs/error.log;
+#error_log  logs/error.log  notice;
+#error_log  logs/error.log  info;
+
+#pid        logs/nginx.pid;
+
+
+events {
+    worker_connections  1024;
+}
+
+
+http {
+    include      mime.types;
+    default_type  application/octet-stream;
+
+    log_format  access   '{"ip":"$remote_addr", "time":"[$time_iso8601]" ,"path":"$request","infosoruce":"$http_user_agent"}';
+
+    #access_log  logs/ip.log access;
+
+    sendfile        on;
+    #tcp_nopush     on;
+
+    #keepalive_timeout  0;
+    keepalive_timeout  65;
+
+    gzip  on;
+
+
+    client_max_body_size 8M;
+    client_body_buffer_size 128k;
+
+	upstream mysvr {   
+	  server 127.0.0.1:8002;
+	}
+
+    server {
+		# port and server name
+		listen 8000;
+		server_name 127.0.0.1;
+
+		# log files
+		access_log /label/iepy-develop/examples/coreline/log/gunicorn_access.log;
+		error_log /label/iepy-develop/examples/coreline/log/gunicorn_error.log;
+
+		# don't log errors for missing favicon.ico requests
+		#location = /favicon.ico { access_log off; log_not_found off; }
+		
+		
+		location /static/ {
+			root /label/iepy-develop/iepy/webui;
+		}
+		location /media/ {
+			root /home/wardseptember/django-blog;
+		}
+		# address of the content generated by gunicorn
+		location / {
+			proxy_pass  http://mysvr;  # forward requests to the server list defined in upstream mysvr
+		}
+	}
+
+
+
+
+}

+ 5 - 0
examples/start.md

@@ -0,0 +1,5 @@
+
+# Start the service
+gunicorn -w 5 --preload -b 0.0.0.0:8002 coreline.wsgi
+# Reload nginx
+/usr/local/nginx/sbin/nginx -s reload

Binary
examples/test/bin/分组_1.xls


Binary
examples/test/bin/分组_10.xls


Binary
examples/test/bin/分组_2.xls


Binary
examples/test/bin/分组_3.xls


Binary
examples/test/bin/分组_4.xls


Binary
examples/test/bin/分组_5.xls


Binary
examples/test/bin/分组_6.xls


Binary
examples/test/bin/分组_7.xls


Binary
examples/test/bin/分组_8.xls


Binary
examples/test/bin/分组_9.xls


+ 13 - 8
iepy/data/models.py

@@ -37,9 +37,9 @@ class EntityKind(BaseModel):
         return self.name
 
 class Payroll(BaseModel):
-    user = models.CharField(max_length=CHAR_MAX_LENGHT)
-    begin_time = models.CharField(max_length=10)
-    end_time = models.CharField(max_length=10)
+    user = models.CharField(max_length=CHAR_MAX_LENGHT,db_index=True)
+    begin_time = models.CharField(max_length=10,db_index=True)
+    end_time = models.CharField(max_length=10,db_index=True)
     doc_count = models.IntegerField()
     t_count = models.IntegerField()
     r_count = models.IntegerField()
@@ -101,18 +101,23 @@ class IEDocument(BaseModel):
                                     on_delete=models.PROTECT)
     human_identifier = models.CharField(
         max_length=CHAR_MAX_LENGHT,
-        unique=True
+        unique=True,
+        db_index=True
     )
 
     sourcetext = models.TextField(null=True)
-    edituser = models.TextField(null=True)
-    edittime = models.DateTimeField(null=True,blank=True)
-    reedittime = models.DateTimeField(null=True,blank=True)
-    brat_done_at = models.DateTimeField(null=True, blank=True)
+    edituser = models.TextField(null=True,db_index=True)
+    edittime = models.DateTimeField(null=True,blank=True,db_index=True)
+    reedittime = models.DateTimeField(null=True,blank=True,db_index=True)
+    brat_done_at = models.DateTimeField(null=True, blank=True,db_index=True)
 
     text = models.TextField()
     creation_date = models.DateTimeField(auto_now_add=True)
 
+    pre_label = models.TextField(blank=True)
+    deleted = models.IntegerField(default=0)
+    added = models.IntegerField(default=0)
+
     # The following 3 lists have 1 item per token
     tokens = ListField(blank=True)  # strings
     lemmas = ListField(blank=True)  # strings

Binary
iepy/selfpreprocess/BiddingKG/dl/complaint/models/punish_code.pb


+ 473 - 0
iepy/selfpreprocess/BiddingKG/dl/complaint/punish_predictor.py

@@ -0,0 +1,473 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/25 0025 16:35 
+
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import os
+import time
+import tensorflow as tf
+# from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+# from keras.preprocessing.sequence import pad_sequences
+# import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/punish_code.pb"):
+        print('model_file_path:',model_file)
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file # model used to predict punishment document numbers
+        self.load_model()
+
+    # load the punishment-code prediction model
+    def load_model(self):
+        log("get model of time")
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name("char_input:0")
+                    self.length = self.sess.graph.get_tensor_by_name("length:0")
+                    self.trans = self.sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                    self.logits = self.sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+    # punishment-code prediction
+    def predict_punishCode(self,list_sentences, MAX_AREA=5000):
+        '''
+        Predict the punishment code for each sentence.
+        :param list_sentences: list of sentence lists, one per article [[sentences of each article]]
+        :param MAX_AREA: cap on sentence length; longer sentences are truncated
+        :return: punishment-code string; multiple codes are separated by ";"
+        '''
+        re_ner = re.compile("12+?3")
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentences.sort(key=lambda x: len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    while True:
+                        MAX_LEN = len(sentences[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA // MAX_LEN
+                        sentence_len = [len(sentence.sentence_text) for sentence in sentences[_begin_index:_begin_index+_LEN]]
+                        sentences_x = []
+                        for sentence in sentences[_begin_index:_begin_index+_LEN]:
+                            sentence = sentence.sentence_text
+                            sentence = list(sentence)
+                            sentence2id = [getIndexOfWord(word) for word in sentence]
+                            sentences_x.append(sentence2id)
+                        sentences_x = pad_sequences(sentences_x, maxlen=MAX_LEN, padding="post", truncating="post")
+                        sentences_x = [np.array(x) for x in sentences_x]
+                        _logits, _trans = self.sess.run([self.logits, self.trans],
+                                                   feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                        viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                        ner_list = []
+                        for _seq, sentence in zip(viterbi_sequence, sentences[_begin_index:_begin_index+_LEN]):
+                            sentence = sentence.sentence_text
+                            seq_id = ''.join([str(s) for s in _seq])
+                            if re_ner.search(seq_id):
+                                # print("sentence: ",sentence)
+                                for _ner in re_ner.finditer(seq_id):
+                                    start = _ner.start()
+                                    end = _ner.end()
+                                    n = sentence[start:end]
+                                    # print(n,'<==>',start,end)
+                                    # ner_list.append((n, start, end))
+                                    ner_list.append(n)  # changed to return only the entity text
+                        # article_ner_list.append(ner_list)
+                        article_ner_list.append(';'.join(set(ner_list)))
+                        if _begin_index+_LEN >= len(sentences):
+                            break
+                        _begin_index += _LEN
+        return article_ner_list[0]
+
+    # punishment type
+    def get_punishType(self, x1, x2):
+        '''Determine the announcement category from its title and body.
+        x1: title
+        x2: body text
+        return: category'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''title regexes'''
+        # unknown announcement
+        unknow = re.compile('采购方式|采购公告|采购招标|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # complaint handling
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # administrative penalty
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # supervision and inspection
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # serious violation
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # misconduct
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # other misconduct
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''body-text regexes'''
+        # complaint handling
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # administrative penalty
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # integrity bonus points
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # serious violation / loss of credibility
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # misconduct
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # other misconduct
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # punishment decision
+    def get_punishDecision(self, x, x2):
+        '''Extract the handling decision from the body text via regex matching.
+        x: body text
+        x2: punishment category
+        return: decision string'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # whether the complaint was upheld
+    def get_punishWhether(self, x1, x2, x3):
+        '''Use regexes on the handling decision to judge whether the complaint was upheld.
+        x1: decision string
+        x2: body text
+        x3: punishment category
+        return: whether the complaint was upheld'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # enforcement agency and punishment time
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        Judge from the text preceding an entity whether it is the enforcement agency.
+        :param title: article title
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: enforcement-agency and punishment-time strings; multiple values are separated by ";"
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # use keywords before an entity to decide whether it is an enforcement agency or a punishment time
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # if no agency was found above, look for entities in the title and check for keywords via regex
+        if institutions == [] and len(title)>10:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # if still not found, check whether a company entity is followed by a date keyword; if so, take them as agency and punishment time
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # if nothing was found, take the last time entity as the punishment time when it sits at the end of the article
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # complainant, respondent and punished party
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        Find the complainant, respondent and punished party via regexes over the announcement category, sentence list and entity list.
+        :param punishType: punishment category of the announcement
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: complainant, respondent
+        '''
+        complainants = []  # complainants
+        punishPeople = []  # respondents / punished parties
+        size = 16
+        # complainant / challenger
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # punished party / respondent
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # list of punishment-related entity groups
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
+        list_result = []
+        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+            title = article.title
+            text=article.content
+            keyword, punishType = self.get_punishType(title, text)
+
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
+                list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
+        return list_result
+
+
+
+if __name__ == "__main__":
+    punish = Punish_Extract()
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts(text=s)
+    # print(punish_dic)

+ 98 - 70
iepy/selfpreprocess/BiddingKG/dl/complaint/punish_rule.py

@@ -75,6 +75,7 @@ def BiLSTM_CRF_tfmodel(sess,weights):
             grads_vars = opt.compute_gradients(crf_loss)
             capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
             train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            print('tensor: ',char_input, length, trans, _logits)
             return char_input,_logits,target,length,crf_loss,trans,train_op
 
 def decode(logits, trans, sequence_lengths, tag_num):
@@ -125,6 +126,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
+                    print('punish tensor: ',self.logits, self.trans, self.char_input, self.length)
                     _logits, _trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
@@ -143,7 +145,7 @@ class Punish_Extract():
                                 # ner_list.append((n, start, end))
                                 ner_list.append(n)  # 改为只返回实体字符
                     # article_ner_list.append(ner_list)
-                    article_ner_list.append(''.join(set(ner_list)))
+                    article_ner_list.append(';'.join(set(ner_list)))
         return article_ner_list[0]
 
     # 处罚类型
@@ -261,7 +263,7 @@ class Punish_Extract():
         elif re.search(rule4, x[-int(len(x)*0.4):]):
             return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
         else:
-            return ' '
+            return ''
 
     # 投诉是否成立
     def get_punishWhether(self, x1, x2, x3):
@@ -278,7 +280,7 @@ class Punish_Extract():
                         '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
                         '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
         if x3 != '投诉处理':
-            return ' '
+            return ''
         elif re.search(p1, x1):
             return '投诉成立'
         elif re.search(p2, x1):
@@ -287,7 +289,7 @@ class Punish_Extract():
             return '投诉成立'
         elif re.search(p2, x2):
             return '投诉无效'
-        return ' '
+        return ''
 
     # 执法机构、处罚时间
     def get_institution(self, title, sentences_l, entity_l):
@@ -296,7 +298,7 @@ class Punish_Extract():
         :param title: 文章标题
         :param sentences_l: 单篇公告句子列表
         :param entity_l: 单篇公告实体列表
-        :return: 执法机构及处罚时间字符串,多个的用号隔开
+        :return: 执法机构及处罚时间字符串,多个的用;号隔开
         '''
         institutions = []
         punishTimes = []
@@ -359,7 +361,7 @@ class Punish_Extract():
             institutions.append(ins)
         if punishTimes == [] and ptime != "":
             punishTimes.append(ptime)
-        return ";".join(institutions), ";".join(punishTimes)
+        return ";".join(institutions), ";".join(punishTimes)
 
     # 投诉人、被投诉人、被处罚人
     def get_complainant(self, punishType, sentences_l, entity_l):
@@ -426,7 +428,7 @@ class Punish_Extract():
                 punishPeople.append(ner_l)
         complainants = set([it.entity_text for l in complainants for it in l])
         punishPeople = set([it.entity_text for l in punishPeople for it in l])
-        return ';'.join(complainants), ';'.join(punishPeople)
+        return ';'.join(complainants), ';'.join(punishPeople)
 
     def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
         list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
@@ -459,73 +461,99 @@ class Punish_Extract():
         for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
             title = article.title
             text=article.content
+
             keyword, punishType = self.get_punishType(title, text)
-            if punishType == "未知类别":
-                list_result.append({"punish":{}})
-            else:
-                # print('处罚类型:',punishType)
-                punish_code = self.predict_punishCode(list_sentences)
-                # print('处罚编号: ',punish_code)
-                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
-                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-                punishDecision = self.get_punishDecision(text, punishType)
-                # print('处罚决定:',punishDecision)
-                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-                # print('投诉是否成立:',punishWhether)
-                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
-                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-                punish_dic = {'punish_code':punish_code,
-                              'punishType':punishType,
-                              'punishDecision':punishDecision,
-                             'complainants':complainants,
-                             'punishPeople':punishPeople,
-                             'punishWhether':punishWhether,
-                             'institutions':institutions,
-                             'punishTimes':punishTimes}
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
                 list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
         return list_result
 
-if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+def save_punish_code_model():
+    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
+    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
+    ckpt = tf.train.get_checkpoint_state(model_folder)
+    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        input_checkpoint = ckpt.model_checkpoint_path
+        saver = tf.train.import_meta_graph(input_checkpoint+".meta", clear_devices=True)
+        graph = tf.get_default_graph()
+        input_graph_def = graph.as_graph_def()
+        with tf.Session() as sess:
+            saver.restore(sess, input_checkpoint)
+            output_graph_def = graph_util.convert_variables_to_constants(
+                sess = sess,
+                input_graph_def = input_graph_def,
+                output_node_names=["char_input","length","crf_loss/transitons","CRF/output/logits"]
+            )
+            with tf.gfile.GFile(output_graph, "wb") as f:
+                f.write(output_graph_def.SerializeToString())
 
-    import pandas as pd
-    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
-    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
-    # i = 89
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    # i = 92
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
 
-    # t1 = time.time()
-    # for i in df.index:
-    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
-    #     df.loc[i, '投诉人'] = complainants
-    #     df.loc[i, '被投诉人'] = punishPeople
-    #     df.loc[i, '执法机构'] = institutions
-    #     df.loc[i, '处罚时间'] = punishTimes
-    #     df.loc[i, '处罚编号'] = punish_code
-    #     print('完成第%d篇'%i)
-    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
-    # #    'institution', 'punishTime', 'ner_test']])
-    # t2 = time.time()
+if __name__ == "__main__":
+    save_punish_code_model()
+    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    #
+    # import pandas as pd
+    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # # i = 89
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # # i = 92
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    #
+    # # t1 = time.time()
+    # # for i in df.index:
+    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    # #     df.loc[i, '投诉人'] = complainants
+    # #     df.loc[i, '被投诉人'] = punishPeople
+    # #     df.loc[i, '执法机构'] = institutions
+    # #     df.loc[i, '处罚时间'] = punishTimes
+    # #     df.loc[i, '处罚编号'] = punish_code
+    # #     print('完成第%d篇'%i)
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # # #    'institution', 'punishTime', 'ner_test']])
+    # # t2 = time.time()
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
     # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
-    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
-    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
-    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
-    # t3 = time.time()
-    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
-    # list_sentences = [s.split('。')]
-    # punish_code= punish.predict_punishCode( list_sentences)
-    # print(punish_code)
-
-    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
-    print(punish_dic)
+    # #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # # t3 = time.time()
+    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # # list_sentences = [s.split('。')]
+    # # punish_code= punish.predict_punishCode( list_sentences)
+    # # print(punish_code)
+    #
+    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts_backup(text=s)
+    # print(punish_dic)

+ 244 - 215
iepy/selfpreprocess/BiddingKG/dl/interface/Preprocessing.py

@@ -107,7 +107,7 @@ def tableToText(soup):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td)),0])
+                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -419,10 +419,10 @@ def tableToText(soup):
                 inner_table[_h][_w][1] = 0
 
 
-        print("=====")
-        for item in inner_table:
-            print(item)
-        print("======")
+        # print("=====")
+        # for item in inner_table:
+        #     print(item)
+        # print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -628,7 +628,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
+        entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
         height = len(inner_table)
         width = len(inner_table[0])
         text = ""
@@ -640,11 +640,7 @@ def tableToText(soup):
                 
             direct = getDirect(inner_table, head_begin, head_end)
 
-            print("----")
-            print(inner_table[head_begin:head_end])
-            print("head_end-head_begin",head_end-head_begin)
-            print(direct)
-            
+
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
                 text_line = ""
@@ -671,38 +667,34 @@ def tableToText(soup):
                         line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
                     table_occurence.append(line_oc)
 
+
+                occu_height = len(table_occurence)
+                occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
                 #为每个属性值寻找表头
-                for i in range(head_begin,head_end):
-                    pack_text = ""
-                    rank_text = ""
-                    entity_text = ""
-                    text_line = ""
-                    #在同一句话中重复的可以去掉
-                    text_set = set()
-                    for j in range(width):
+                for i in range(occu_height):
+                    for j in range(occu_width):
                         cell = table_occurence[i][j]
                         #是属性值
                         if cell["type"]==0 and cell["text"]!="":
                             left_head = ""
                             top_head = ""
 
-                            head = ""
-
                             find_flag = False
                             temp_head = ""
-                            for loop_i in range(0,i+1-head_begin):
+                            for loop_i in range(1,i+1):
                                 if not key_direct:
                                     key_values = [1,2]
                                 else:
                                     key_values = [1]
                                 if table_occurence[i-loop_i][j]["type"] in key_values:
                                     if find_flag:
-                                        if table_occurence[i-loop_i][j][0]!=temp_head:
+                                        if table_occurence[i-loop_i][j]["text"]!=temp_head:
                                             top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
                                     else:
                                         top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
                                     find_flag = True
                                     temp_head = table_occurence[i-loop_i][j]["text"]
+                                    table_occurence[i-loop_i][j]["occu_count"] += 1
                                 else:
                                     #找到表头后遇到属性值就返回
                                     if find_flag:
@@ -720,38 +712,21 @@ def tableToText(soup):
                                     key_values = [1,2]
                                 else:
                                     key_values = [2]
-                                if inner_table[i][j-loop_j][1] in key_values:
+                                if table_occurence[i][j-loop_j]["type"] in key_values:
                                     if find_flag:
-                                        if inner_table[i][j-loop_j][0]!=temp_head:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
+                                        if table_occurence[i][j-loop_j]["text"]!=temp_head:
+                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
                                     else:
-                                        head = inner_table[i][j-loop_j][0]+":"+head
+                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
                                     find_flag = True
-                                    temp_head = inner_table[i][j-loop_j][0]
+                                    temp_head = table_occurence[i][j-loop_j]["text"]
+                                    table_occurence[i][j-loop_j]["occu_count"] += 1
                                 else:
                                     if find_flag:
                                         break
-
-                            if str(head+inner_table[i][j][0]) in text_set:
-                                continue
-                            if re.search(packPattern,head) is not None:
-                                pack_text += head+inner_table[i][j][0]+","
-                            elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
-                                #排名替换为同一种表达
-                                rank_text += head+inner_table[i][j][0]+","
-                                #print(rank_text)
-                            elif re.search(entityPattern,head) is not None:
-                                entity_text += head+inner_table[i][j][0]+","
-                                #print(entity_text)
-                            else:
-                                text_line += head+inner_table[i][j][0]+","
-                            text_set.add(str(head+inner_table[i][j][0]))
-                    text += pack_text+rank_text+entity_text+text_line
-                    text = text[:-1]+"。" if len(text)>0 else text
-
-
+                            cell["left_head"] += left_head
                 if direct=="row":
-                    for i in range(head_begin,head_end):
+                    for i in range(occu_height):
                         pack_text = ""
                         rank_text = ""
                         entity_text = ""
@@ -759,131 +734,196 @@ def tableToText(soup):
                         #在同一句话中重复的可以去掉
                         text_set = set()
                         for j in range(width):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                head = ""
-                                
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        #找到表头后遇到属性值就返回
-                                        if find_flag:
-                                            break
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
 
-                                find_flag = False
-                                temp_head = ""
-
-
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                
-                                if str(head+inner_table[i][j][0]) in text_set:
+                                cell = table_occurence[i][j]
+                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
+                                head += cell["left_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
-                                    pack_text += head+inner_table[i][j][0]+","
+                                    pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    rank_text += head+inner_table[i][j][0]+","
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+
                         text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
                 else:
-                    for j in range(width):
-                    
+                    for j in range(occu_width):
+                        pack_text = ""
                         rank_text = ""
                         entity_text = ""
                         text_line = ""
                         text_set = set()
-                        for i in range(head_begin,head_end):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                find_flag = False
-                                head = ""
-                                temp_head = ""
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                if str(head+inner_table[i][j][0]) in text_set:
+                        for i in range(occu_height):
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
+                                head += cell["top_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
-                                if re.search(rankPattern,head) is not None:
-                                    rank_text += head+inner_table[i][j][0]+","
+                                if re.search(packPattern,head) is not None:
+                                    pack_text += head+cell["text"]+","
+                                elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                                    #排名替换为同一种表达
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
-                        text += rank_text+entity_text+text_line
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+                        text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
+
+                # if direct=="row":
+                #     for i in range(head_begin,head_end):
+                #         pack_text = ""
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         #在同一句话中重复的可以去掉
+                #         text_set = set()
+                #         for j in range(width):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 head = ""
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         #找到表头后遇到属性值就返回
+                #                         if find_flag:
+                #                             break
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #
+                #
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(packPattern,head) is not None:
+                #                     pack_text += head+inner_table[i][j][0]+","
+                #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                #                     #排名替换为同一种表达
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += pack_text+rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
+                # else:
+                #     for j in range(width):
+                #
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         text_set = set()
+                #         for i in range(head_begin,head_end):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 find_flag = False
+                #                 head = ""
+                #                 temp_head = ""
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(rankPattern,head) is not None:
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
         return text
     
     def removeFix(inner_table,fix_value="~~"):
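To make the header-prefixing logic above easier to follow, here is a minimal, self-contained sketch of the same idea (not the project code, and only the left-header lookup is shown): value cells are emitted as "表头:值", duplicates inside one row are skipped, and each row is closed with a Chinese full stop. The cell layout and the "text"/"type" field names are simplified assumptions.

def flatten_table(cells):
    # cells: 2-D list of dicts like {"text": "...", "type": 0 or 1}; 0 = value, 1 = header
    text = ""
    for row in cells:
        seen = set()
        line = ""
        for j, cell in enumerate(row):
            if cell["type"] != 0 or cell["text"] == "":
                continue
            # nearest header to the left in the same row
            left = next((c["text"] for c in reversed(row[:j]) if c["type"] == 1), "")
            item = ((left + ":") if left else "") + cell["text"]
            if item in seen:  # drop duplicates within the same row
                continue
            seen.add(item)
            line += item + ","
        if line:
            text += line[:-1] + "。"
    return text

print(flatten_table([[{"text": "项目名称", "type": 1}, {"text": "设备采购", "type": 0}]]))
# -> 项目名称:设备采购。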
@@ -948,12 +988,13 @@ def tableToText(soup):
     # return list_innerTable
 
 #数据清洗
-def segment(soup):
+def segment(soup,final=True):
     # print("==")
     # print(soup)
     # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
-    if soup.name=="td":
+    subspaceList = ["td",'a',"span","p"]
+    if soup.name in subspaceList:
         #判断有值叶子节点数
         _count = 0
         for child in soup.find_all(recursive=True):
@@ -966,27 +1007,26 @@ def segment(soup):
                 if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
                     text = soup.attrs['title']
 
-            _list = []
-            for x in re.split("\s+",text):
-                if x.strip()!="":
-                    _list.append(len(x))
-            if len(_list)>0:
-                _minLength = min(_list)
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-            else:
-                _substr = ""
-            text = _substr.join(re.split("(\s+)",text))
+            # _list = []
+            # for x in re.split("\s+",text):
+            #     if x.strip()!="":
+            #         _list.append(len(x))
+            # if len(_list)>0:
+            #     _minLength = min(_list)
+            #     if _minLength>2:
+            #         _substr = ","
+            #     else:
+            #         _substr = ""
+            # else:
+            #     _substr = ""
             text = text.replace("\r\n",",").replace("\n",",")
-            text = re.sub("^[,\s]*|[,\s]*$","",text)
+            text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
     commaList = ["div","br","td","p"]
     #commaList = []
     spaceList = ["span"]
-    subspaceList = ["td",'a',"span","p"]
+
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')
@@ -1000,8 +1040,8 @@ def segment(soup):
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
-        if child.name in spaceList:
-            child.insert_after(" ")
+        # if child.name in spaceList:
+        #     child.insert_after(" ")
     text = str(soup.get_text())
 
     #替换英文冒号为中文冒号
@@ -1012,67 +1052,56 @@ def segment(soup):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     
          
-    #删除标签中的所有空格
-    for subs in subspaceList:
-        patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
-        while(True):
-            oneMatch = re.search(re.compile(patten),text)
-            if oneMatch is not None:
-                _match = oneMatch.group(1)
-                _minLength = min([len(x) for x in re.split("(\s*)",_match)])
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s",_substr,oneMatch.group(1)))
-            else:
-                break
-    
-    
+
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = re.sub("\s{4,}",",",text)   
     #替换标点
-    while(True):
-        #替换连续的标点
-        punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
-        if punc is not None:
-            text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
+
+    #替换连续的标点
+
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
+
+    list_punc = re.findall(punc_pattern,text)
+    list_punc.sort(key=lambda x:len(x),reverse=True)
+    for punc_del in list_punc:
+        if len(punc_del)>1:
+            text = re.sub(punc_del,punc_del[-1],text)
         
-        punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
-        if punc is not None:
-            text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
-        else:
-            #替换标点之后的空格
-            punc = re.search("(?P<punc>:|。|,|;)\s+",text)
-            if punc is not None:
-                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
-            else:
-                break
+
     #将连续的中文句号替换为一个
     text_split = text.split("。")
     text_split = [x for x in text_split if len(x)>0]
-    list_text = []
-    # for _t in text_split:
-    #     list_text.append(re.sub(")",")",re.sub("(","(",re.sub("\s*","",_t))))
     text = "。".join(text_split)
-    # text = text.replace(')',")").replace("(","(").replace("\s","")
-    #删除所有空格
+
+    # #删除标签中的所有空格
+    # for subs in subspaceList:
+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
+    #     while(True):
+    #         oneMatch = re.search(re.compile(patten),text)
+    #         if oneMatch is not None:
+    #             _match = oneMatch.group(1)
+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
+    #         else:
+    #             break
+
     # text过大报错
     LOOP_LEN = 10000
     LOOP_BEGIN = 0
     _text = ""
+
+
+
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s*","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
-    else:
-        return text
-    # text = re.sub("\s*","",text)
-    # #替换中文括号为英文括号
-    # text = re.sub("(","(",text)
-    # text = re.sub(")",")",text)
-    return _text
+        text = _text
+
+    if final:
+        text = re.sub("##space##"," ",text)
+
+    return text
 
 '''
 #数据清洗
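The reworked segment() above protects meaningful whitespace with a "##space##" placeholder, collapses runs of mixed punctuation down to their last character, and only restores the spaces on the final pass. A stand-alone sketch of the punctuation-collapsing step (the character class is copied from the diff; the helper name and the re.escape call are mine):

import re

def collapse_punct(text):
    # collect runs of punctuation/whitespace and keep only the last character of each run;
    # processing the longest runs first avoids re-matching their pieces on stale text
    runs = re.findall(r"[。,;::,\s]+", text)
    runs.sort(key=len, reverse=True)
    for run in runs:
        if len(run) > 1:
            text = re.sub(re.escape(run), run[-1], text)
    return text

print(collapse_punct("预算金额 :100万元, ,。"))  # -> 预算金额:100万元。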

+ 3 - 0
iepy/selfpreprocess/BiddingKG/dl/interface/getAttributes.py

@@ -1177,6 +1177,9 @@ def getOtherAttributes(list_entity):
             dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
+        elif entity.entity_type=='product':
+            dict_other["product"].append(entity.entity_text)
+    dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 
 

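getOtherAttributes() now also collects product entity texts and deduplicates them with list(set(...)), which does not preserve order. If first-occurrence order ever matters, a dict-based variant keeps it (a side note, not what the commit does):

products = ["服务器", "交换机", "服务器"]
print(list(set(products)))            # deduplicated, arbitrary order (as in the diff)
print(list(dict.fromkeys(products)))  # deduplicated, first-occurrence order kept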
+ 94 - 8
iepy/selfpreprocess/BiddingKG/dl/interface/predictor.py

@@ -16,6 +16,8 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
+from tensorflow.python.framework import graph_util
+from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 
 from threading import RLock
@@ -223,7 +225,7 @@ class CodeNamePredict():
             list_entitys = [[] for _ in range(len(list_sentences))]
         for list_sentence,list_entity in zip(list_sentences,list_entitys):
             if len(list_sentence)==0:
-                result.append([list_sentence[0].doc_id,{"code":[],"name":""}])
+                result.append([{"code":[],"name":""}])
                 continue
             doc_id = list_sentence[0].doc_id
             # sentences = []
@@ -522,7 +524,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
             p_sentences = 0
             while(p_entitys<len(list_entity)):
@@ -557,7 +561,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
     
             while(p_entitys<len(list_entity)):
@@ -583,10 +589,12 @@ class PREMPredict():
     
     def predict_role(self,list_sentences, list_entitys):
         datas = self.search_role_data(list_sentences, list_entitys)
+
         if datas is None:
             return
         points_entitys = datas[1]
-        
+
+
         if USE_PAI_EAS:
             _data = datas[0]
             _data = np.transpose(np.array(_data),(1,0,2))
@@ -1134,7 +1142,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,30,60)
+        self.input_shape = (2,10,128)
         self.load_model()
 
     def load_model(self):
@@ -1168,10 +1176,13 @@ class TimePredictor():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
-                            left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
-                            right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
+                            left = s[0]
+                            right = s[1]
                             context = [left, right]
-                            x = embedding_word(context, shape=self.input_shape)
+                            x = embedding(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1198,6 +1209,80 @@ class TimePredictor():
                     values.append(item)
                     entity.set_Role(label, values)
 
+# Product field extraction
+class ProductPredictor():
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.load_model()
+
+    def load_model(self):
+        model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(model_path, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name='')
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
+                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
+                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
+                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
+                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+
+    def predict(self, list_sentences,list_entitys=None, MAX_AREA=5000):
+        '''
+        Predict product entities; each sentence contributes at most MAX_AREA characters and is truncated beyond that.
+        :param list_sentences: sentence lists of multiple announcements, [[sentences of announcement 1],[sentences of announcement 2]]
+        :param list_entitys: entity lists of multiple announcements
+        :param MAX_AREA: maximum number of characters taken from each sentence
+        :return: the predicted entities are placed into the entity lists
+        '''
+        with self.sess.as_default() as sess:
+            with self.sess.graph.as_default():
+                result = []
+                if list_entitys is None:
+                    list_entitys = [[] for _ in range(len(list_sentences))]
+                for list_sentence, list_entity in zip(list_sentences,list_entitys):
+                    if len(list_sentence)==0:
+                        result.append({"product":[]})
+                        continue
+                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    item = {"product":[]}
+                    temp_list = []
+                    while True:
+                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA//MAX_LEN
+                        chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]])
+                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                          feed_dict={
+                                                                    self.char_input: np.asarray(chars),
+                                                                    self.dropout: 1.0
+                                                                    })
+                        batch_paths = decode(scores, lengths, tran_)
+                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
+                            tags = ''.join([str(it) for it in path[:length]])
+                            for it in re.finditer("12*3", tags):
+                                start = it.start()
+                                end = it.end()
+                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                                sentence.doc_id, sentence.sentence_index, start, end),
+                                                 entity_text=sentence.sentence_text[start:end],
+                                                 entity_type="product", sentence_index=sentence.sentence_index,
+                                                 begin_index=0, end_index=0, wordOffset_begin=start,
+                                                 wordOffset_end=end)
+                                list_entity.append(_entity)
+                                temp_list.append(sentence.sentence_text[start:end])
+                        item["product"] = list(set(temp_list))
+                        result.append(item)
+                        if _begin_index+_LEN >= len(list_sentence):
+                            break
+                        _begin_index += _LEN
+                return result
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1559,6 +1644,7 @@ def save_timesplit_model():
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
 
+
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()

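ProductPredictor loads a frozen TensorFlow 1.x GraphDef (product.pb) and looks up its input/output tensors by name before running the CRF decode. A minimal sketch of that loading pattern, assuming the TF 1.x API used throughout the diff (the tensor names are the ones referenced above):

import tensorflow as tf  # TF 1.x style, as in the project

def load_frozen_graph(pb_path, tensor_names):
    # returns a session bound to the frozen graph plus the requested tensors
    sess = tf.Session(graph=tf.Graph())
    with sess.graph.as_default():
        graph_def = tf.GraphDef()
        with open(pb_path, "rb") as f:
            graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name="")
        tensors = [sess.graph.get_tensor_by_name(n) for n in tensor_names]
    return sess, tensors

# usage sketch:
# sess, (char_input, logits, trans) = load_frozen_graph(
#     "product.pb", ["CharInputs:0", "logits/Reshape:0", "crf_loss/transitions:0"])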
Binary
iepy/selfpreprocess/BiddingKG/dl/interface/product_savedmodel/product.pb


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/saved_model.pb


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 0 - 0
iepy/selfpreprocess/BiddingKG/dl/product/__init__.py


+ 155 - 0
iepy/selfpreprocess/BiddingKG/dl/product/data_util.py

@@ -0,0 +1,155 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:19
+import re
+import math
+import random
+import psycopg2
+import numpy as np
+from tensorflow.contrib.crf import viterbi_decode
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+
+id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+word_model = getModel_word()
+vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+word2id = {k: v for v, k in enumerate(vocab)}
+max_id = len(vocab)
+conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
+cursor = conn.cursor()
+
+def get_label_data():
+    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
+      and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
+    cursor.execute(sql)
+    writer = open('label_data.txt', 'w', encoding='utf-8')
+    datas = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        text = row[1]
+        # string = list(text)
+        tags = [0]*len(text)
+        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
+        cursor.execute(sql_lb)
+        for row_lb in cursor.fetchall():
+            label = row_lb[0]
+            _, _, begin, end, _ = re.split('\s',label)
+            begin = int(begin)
+            end = int(end)
+            if end-begin>=2:
+                tags[begin]=1
+                tags[end-1]=3
+                for i in range(begin+1,end-1):
+                    tags[i]=2
+        # datas.append([string, tags])
+        text_sentence = []
+        ids_sentence = []
+        tag_sentence = []
+        for i in range(len(text)):
+            text_sentence.append(text[i])
+            ids_sentence.append(word2id.get(text[i], max_id))
+            tag_sentence.append(tags[i])
+            writer.write("%s\t%s\n"%(text[i],tags[i]))
+            if text[i] in ['。','?','!',';']:
+                writer.write('\n')
+                if text_sentence:
+                    if len(text_sentence) > 100:
+                    # if len(text_sentence)>5 and len(text_sentence)<1000:
+                        datas.append([text_sentence, ids_sentence,tag_sentence])
+                    elif len(text_sentence) > 5:
+                        continue
+                    else:
+                        print('单句小于5或大于100,句子长度为:%d,文章ID:%s'%(len(text_sentence), docid))
+                    text_sentence = []
+                    ids_sentence = []
+                    tag_sentence = []
+        if text_sentence:
+            if len(text_sentence) > 5:
+            # if len(text_sentence) > 5 and len(text_sentence) < 1000:
+                datas.append([text_sentence, ids_sentence, tag_sentence])
+            else:
+                print('单句小于5或大于100,句子长度为:%d,文章ID:%s' % (len(text_sentence), docid))
+    writer.close()
+    return datas
+
+def input_from_line(line):
+    string = list(line)
+    ids = [word2id.get(k, max_id) for k in string]
+    tags = []
+    return [[string], [ids], [tags]]
+def process_data(sentences):
+    '''
+    Convert the strings to word-id sequences and pad them to a uniform length.
+    :param sentences: list of sentence strings from a document, e.g. ['招标公告','招标代理']
+    :return: padded id sequences of uniform length
+    '''
+    maxLen = max([len(sentence) for sentence in sentences])
+    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
+    return pad_tags
+
+def get_ner(BIE_tag):
+    ner = set()
+    for it in re.finditer('BI*E',BIE_tag):
+        ner.add((it.start(),it.end()))
+    return ner
+
+def decode(logits, lengths, matrix):
+    paths = []
+    small = -1000.0
+    start = np.asarray([[small]*4+[0]])
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        pad = small * np.ones([length, 1])
+        logits = np.concatenate([score, pad], axis=1)
+        logits = np.concatenate([start, logits], axis=0)
+        path, _  = viterbi_decode(logits, matrix)
+        paths.append(path[1:])
+    return paths
+
+def result_to_json(line, tags):
+    result = []
+    ner = []
+    tags = ''.join([str(it) for it in tags])
+    for it in re.finditer("12*3", tags):
+        start = it.start()
+        end = it.end()
+        ner.append([line[start:end], (start, end)])
+    result.append([line, ner])
+    print(tags)
+    return result
+
+
+class BatchManager(object):
+    def __init__(self, data, batch_size):
+        self.batch_data = self.sort_and_pad(data, batch_size)
+        self.len_data = len(self.batch_data)
+
+    def sort_and_pad(self, data, batch_size):
+        num_batch = int(math.ceil(len(data)/batch_size))
+        sorted_data = sorted(data, key=lambda x:len(x[0]))
+        print('最小句子长度:%d;最大句子长度:%d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # 临时增加打印句子长度
+        batch_data = list()
+        for i in range(num_batch):
+            batch_data.append(self.pad_data(sorted_data[i*int(batch_size):(i+1)*int(batch_size)]))
+        return batch_data
+
+    @staticmethod
+    def pad_data(data):
+        strings = []
+        chars = []
+        targets = []
+        max_length = max([len(sentence[0]) for sentence in data])
+        for line in data:
+            string, char, target = line
+            padding = [0]*(max_length-len(string))
+            strings.append(string + padding)
+            chars.append(char + padding)
+            targets.append(target + padding)
+        return [strings, chars, targets]
+
+    def iter_batch(self, shuffle=False):
+        if shuffle:
+            random.shuffle(self.batch_data)
+        for idx in range(self.len_data):
+            yield self.batch_data[idx]

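decode() above pads each score matrix with a large negative value, prepends an artificial start step, and runs Viterbi decoding against the learned transition matrix. A tiny self-contained run of the same call on dummy scores (tensorflow.contrib.crf is a TF 1.x dependency, as in the diff):

import numpy as np
from tensorflow.contrib.crf import viterbi_decode

num_tags, length, small = 4, 6, -1000.0
scores = np.random.rand(length, num_tags)          # per-character tag scores
trans = np.zeros((num_tags + 1, num_tags + 1))     # transition matrix incl. padding tag

pad = small * np.ones([length, 1])
logits = np.concatenate([scores, pad], axis=1)     # add the padding tag column
start = np.asarray([[small] * num_tags + [0]])
logits = np.concatenate([start, logits], axis=0)   # prepend the artificial start step

path, _ = viterbi_decode(logits, trans)
print(path[1:])                                    # drop the start step, one tag per character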
+ 68 - 4
iepy/selfpreprocess/self_preprocess.py

@@ -18,31 +18,53 @@ import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
 import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
 import json
 from iepy.selfpreprocess.pipeline import PreProcessSteps
+import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
 
 from iepy.webui.brat.src import annotator
 
 
 logger = logging.getLogger(__name__)
 
+
 codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
 timePredict = predictor.TimePredictor()
+punish = punish_rule.Punish_Extract()
+productPredict = predictor.ProductPredictor()
 
 def predict(doc_id,text):
 
     log("process %s"%doc_id)
+
     list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
 
     codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    print(codeName)
     premPredict.predict(list_sentences,list_entitys)
+    productPredict.predict(list_sentences,list_entitys)
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
+    print("entityLink")
     timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
     entityLink.link_entitys(list_entitys)
-    _prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-    log("extract done %s"%(str(_prem)))
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+
+
+    # codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    # productPredict.predict(list_sentences,list_entitys)
+    # premPredict.predict(list_sentences,list_entitys)
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    # epcPredict.predict(list_sentences,list_entitys)
+    # timePredict.predict(list_sentences, list_entitys)
+    # entityLink.link_entitys(list_entitys)
+    # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    #
+    log("extract done %s"%(str(prem)))
     return list_articles,list_sentences,list_entitys
 
 
@@ -135,6 +157,7 @@ class SelfAnalizer():
         dict_sentences = dict()
         offset_word = 0
         offset_words = 0
+        self.sentences.sort(key=lambda x:x.sentence_index)
         for sentence in self.sentences:
             # print(len(sentence.sentence_text),sentence.sentence_text)
             if sentence.sentence_index not in dict_sentences:
@@ -231,39 +254,78 @@ class SelfAnalizer():
 
     def generate_spans_relations(self):
         print("%s entity length:%d"%(self.docid,len(self.entitys)))
+        list_pre_label = []
         for _entity in self.entitys:
             doc_id = _entity.doc_id
             offset = [[self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin,self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end]]
             _type = getType(_entity)
             ann_id = annotator.create_span_interface(document=doc_id,offsets=offset,_type=_type)
             _entity.ann_id = ann_id
+            _label = "T|%s|%d|%d"%(_type,offset[0][0],offset[0][1])
+            list_pre_label.append(_label)
         for _entity in self.entitys:
             if _entity.pointer_pack is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_pack.ann_id
                 _type = dict_relations["pointer_pack"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_pack
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_money is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_money.ann_id
                 # print("$$$$$$$$",_entity.pointer_money.entity_text)
                 _type = dict_relations["pointer_money"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_money
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_person is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_person.ann_id
                 _type = dict_relations["pointer_person"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_person
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_address is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_address.ann_id
                 _type = dict_relations["pointer_address"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_address
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_tendereeMoney is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_tendereeMoney.ann_id
                 _type = dict_relations["pointer_tendereeMoney"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_tendereeMoney
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
+        return list_pre_label
 
 
 
@@ -282,7 +344,7 @@ class SelfPreprocesser(BasePreProcessStepRunner):
     def run_everything(self,document):
         analysis = SelfAnalizer(document.human_identifier,document.sourcetext)
         # Tokenization
-        if len(analysis.entitys)>5 and len(analysis.entitys)<500:
+        if len(analysis.entitys)>5 and len(analysis.entitys)<300:
             document.text = analysis.article.content
             tokens = analysis.get_tokens()
             offsets = analysis.get_token_offsets()
@@ -310,7 +372,8 @@ class SelfPreprocesser(BasePreProcessStepRunner):
 
             # Save progress so far, next step doesn't modify `document`
             document.save()
-            analysis.generate_spans_relations()
+            list_pre_label = analysis.generate_spans_relations()
+            document.pre_label = ';'.join(list_pre_label)
             document.brat_done_at = datetime.datetime.now()
             document.save()
         else:
@@ -324,3 +387,4 @@ if __name__=="__main__":
 
 
 
+

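generate_spans_relations() now also returns the pre-annotation as compact strings, "T|type|begin|end" for spans and "R|type|origin_begin|origin_end|target_begin|target_end" for relations, joined with ';' into document.pre_label so that later manual edits can be compared against the model output. A small helper for turning such a string back into a set (the helper and the example values are illustrative, not taken from the project):

def pre_label_to_set(pre_label):
    # split "T|...;R|...;" back into individual label strings, ignoring empty pieces
    return {part for part in (pre_label or "").split(";") if part != ""}

labels = pre_label_to_set("T|org|0|4;R|pointer_money|0|4|10|15;")
print(len(labels))  # 2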
File diff is too large to display
+ 159 - 0
iepy/selfpreprocess/test4.py


+ 1 - 1
iepy/webui/brat/models.py

@@ -13,7 +13,7 @@ class BaseModel(models.Model):
 
 class BratAnnotation(BaseModel):
 
-    document_id = models.CharField(max_length=CHAR_LENGTH)
+    document_id = models.CharField(max_length=CHAR_LENGTH,db_index=True)
 
     value = models.CharField(max_length=CHAR_LENGTH*3)
 

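The new db_index=True on document_id backs the frequent per-document lookups (for example the filter/delete calls in save_annotations_db), and it only takes effect after running makemigrations and migrate for the app. A usage sketch of the kind of query that benefits (the import path is an assumption):

from iepy.webui.brat.models import BratAnnotation  # import path assumed

# fetch all stored annotation lines for one document; served by the new index
values = (BratAnnotation.objects
          .filter(document_id="74800260")
          .values_list("value", flat=True))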
+ 83 - 14
iepy/webui/brat/src/annotation.py

@@ -245,18 +245,61 @@ def get_annotations_db(filename):
         list_ann.append(ann.value)
     return "\n".join(list_ann)
 
+import re
+from datetime import datetime
+T_pattern = re.compile('(?P<T>T\d+)\t(?P<type>[^\s]+) (?P<begin>\d+) (?P<end>\d+)')
+R_pattern = re.compile("(?P<R>R\d+)\t(?P<type>[^\s]+)[^:]*:(?P<arg1>T\d+)[^:]*:(?P<arg2>T\d+)")
+
+import traceback
+
 def save_annotations_db(filename,str_ann):
     #save the annotations to db instead of textfile
     try:
+        pre_label = IEDocument.objects.filter(human_identifier=filename).values("pre_label")
+        _label = ""
+        if len(pre_label)>0:
+            if pre_label[0]["pre_label"] is not None:
+                _label = pre_label[0]["pre_label"]
+        dict_T = dict()
+        dict_R = dict()
+        set_pre_label = set()
+        for _i in _label.split(";"):
+            if _i!="":
+                set_pre_label.add(_i)
+        set_label = set()
         with transaction.atomic():
             brat_annotations.objects.filter(document_id=filename).delete()
+            list_anno = []
             for _str in str_ann.split("\n"):
-                # print("======",_str,"=====")
                 _str = _str.strip()
                 if _str != "":
                     ann = brat_annotations(document_id=filename,value=_str)
-                    ann.save()
+                    list_anno.append(ann)
+
+                    if _str[0]=="T":
+                        match = re.search(T_pattern,_str)
+                        if match is not None:
+                            match = match.groupdict()
+                            dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+                    if _str[0]=="R":
+                        match = re.search(R_pattern,_str)
+                        if match is not None:
+                            match = match.groupdict()
+                            dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+            if len(list_anno)>0:
+                brat_annotations.objects.bulk_create(list_anno)
+            for _T,_v in dict_T.items():
+                set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+            for _R,_v in dict_R.items():
+                set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+            union_set = set_pre_label&set_label
+            deleted = len(set_pre_label)-len(union_set)
+            added = len(set_label)-len(union_set)
+            IEDocument.objects.filter(human_identifier=filename).update(reedittime=datetime.now(),deleted=deleted,added=added)
+
+
     except Exception as e:
+        traceback.print_exc()
         logger.warn("document %s save error"%filename)
 
 def __split_annotation_id(id):
@@ -1257,7 +1300,8 @@ class TextAnnotations(Annotations):
             #     return f.read()
             #change read ways to db
             _ieDoc = IEDocument.objects.filter(human_identifier=document)
-            return _ieDoc.get().text
+            logger.warn(document)
+            return _ieDoc.first().text
         except IOError:
             Messager.error('Error reading document text from %s' % textfn)
         raise AnnotationTextFileNotFoundError(document)
@@ -1749,14 +1793,39 @@ class BinaryRelationAnnotation(IdedAnnotation):
 
 
 if __name__ == '__main__':
-    from sys import stderr, argv
-    for ann_path_i, ann_path in enumerate(argv[1:]):
-        print >> stderr, ("%s.) '%s' " % (ann_path_i, ann_path, )
-                          ).ljust(80, '#')
-        try:
-            with Annotations(ann_path) as anns:
-                for ann in anns:
-                    print >> stderr, str(ann).rstrip('\n')
-        except ImportError:
-            # Will try to load the config, probably not available
-            pass
+    filename = '74800260'
+    pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label")
+    _label = ""
+    if len(pre_label)>0:
+        if pre_label[0]["pre_label"] is not None:
+            _label = pre_label[0]["pre_label"]
+    dict_T = dict()
+    dict_R = dict()
+    set_pre_label = set()
+    for _i in _label.split(";"):
+        if _i!="":
+            set_pre_label.add(_i)
+    set_label = set()
+    anns = brat_annotations.objects.filter(document_id='74800260').values("value")
+    for _str in anns:
+        _str = _str["value"].strip()
+        if _str != "":
+            if _str[0]=="T":
+                match = re.search(T_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+            if _str[0]=="R":
+                match = re.search(R_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+    for _T,_v in dict_T.items():
+        set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+    for _R,_v in dict_R.items():
+        set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+    union_set = set_pre_label&set_label
+    deleted = len(set_pre_label)-len(union_set)
+    added = len(set_label)-len(union_set)
+    print(deleted,added)
+    # IEDocument.objects.filter(human_identifier=filename).update(reedittime=datetime.now(),deleted=deleted,added=added)

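save_annotations_db() now parses the saved lines with T_pattern/R_pattern, rebuilds the same "T|..."/"R|..." strings, and compares them with the stored pre_label to record how many pre-annotations were removed and how many were newly added. The counting is two set differences; a stripped-down run with toy data:

pre_label = {"T|org|0|4", "T|money|10|15", "R|pointer_money|0|4|10|15"}
current = {"T|org|0|4", "T|person|20|23"}

kept = pre_label & current              # annotations present both before and after editing
deleted = len(pre_label) - len(kept)    # pre-annotations the user removed or changed
added = len(current) - len(kept)        # annotations the user introduced
print(deleted, added)                   # 2 1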
+ 0 - 3
iepy/webui/brat/src/config.py

@@ -34,9 +34,6 @@ BASE_DIR = os.path.dirname(__file__)
 # WORK_DIR = path_join(BASE_DIR, '/work')
 DATA_DIR = BASE_DIR+"/data"
 WORK_DIR = BASE_DIR+"/work"
-print("%%")
-print(BASE_DIR)
-print(WORK_DIR)
 # If you have installed brat as suggested in the installation
 # instructions, you can set up BASE_DIR, DATA_DIR and WORK_DIR by
 # removing the three lines above and deleting the initial '#'

+ 3 - 1
iepy/webui/brat/src/dispatch.py

@@ -24,7 +24,7 @@ from convert.convert import convert
 from delete import delete_collection, delete_document
 from docimport import save_import
 from document import (get_configuration, get_directory_information,
-                      get_document, get_document_timestamp,moveDocument,searchLabel)
+                      get_document, get_document_timestamp,moveDocument,searchLabel,getChangedOfDocument,getSelfLabel)
 from download import download_collection, download_file
 from jsonwrap import dumps
 from message import Messager
@@ -50,8 +50,10 @@ def logging_no_op(collection, document, log):
 DISPATCHER = {
     'getCollectionInformation': get_directory_information,
     'getDocument': get_document,
+    'getChangedOfDocument':getChangedOfDocument,
     'moveDocument':moveDocument,
     'searchLabel':searchLabel,
+    'getSelfLabel':getSelfLabel,
     'getDocumentTimestamp': get_document_timestamp,
     'importDocument': save_import,
 

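dispatch.py registers the two new actions in the same name-to-handler table used by the rest of the brat AJAX API, so the client only has to send the action name. The pattern is a plain dict lookup; a minimal sketch with a stub handler:

def get_changed_of_document(document):
    # stub standing in for document.getChangedOfDocument
    return {"deleted": 0, "added": 0}

DISPATCHER = {"getChangedOfDocument": get_changed_of_document}
print(DISPATCHER["getChangedOfDocument"]("74800260"))  # {'deleted': 0, 'added': 0}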
+ 96 - 22
iepy/webui/brat/src/document.py

@@ -45,7 +45,53 @@ from stats import get_statistics
 from iepy.webui.corpus.models import IEDocument
 from django.db.models import Q
 from threading import RLock
-from django.db import connection
+from django.db import connection,transaction
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+def getSelfLabel(time_begin,time_end,page_num,request):
+    _limit = 20
+    if page_num=="":
+        page_num = 1
+    elif int(page_num)<=0:
+        page_num = 1
+    _offset = _limit*(int(page_num)-1)
+
+    if time_begin!="":
+        and_begin = " to_char(edittime,'yyyy-mm-dd')>='%s'" %(time_begin)
+    else:
+        and_begin = " 1=1 "
+    if time_end!="":
+        and_end = " to_char(edittime,'yyyy-mm-dd')<='%s' "%(time_end)
+    else:
+        and_end = " 1=1 "
+    user = request.user.username
+    cursor = connection.cursor()
+    where_sql = " where edituser='%s' and %s and %s "%(user,and_begin,and_end)
+    sql = "select count(1) from corpus_iedocument %s"%where_sql
+    logger.warn(sql)
+    cursor.execute(sql)
+    all_num = cursor.fetchall()[0][0]
+    num_page = all_num//_limit
+    if all_num%_limit!=0:
+        num_page += 1
+    list_result = []
+    where_sql = " where edituser='%s' and %s and %s offset %d limit %d "%(user,and_begin,and_end,_offset,_limit)
+    sql = " select row_number() over(order by edittime desc) as id,human_identifier as document_id,to_char(edittime,'yyyy-mm-dd') as edittime,deleted,added from corpus_iedocument %s"%where_sql
+    logger.warning(sql)
+    cursor.execute(sql)
+    vol = cursor.description
+    rows = cursor.fetchall()
+    for row in rows:
+        _dict = dict()
+        for _n,_v in zip(vol,row):
+            _name = _n[0]
+            _dict[_name] = _v
+        list_result.append(_dict)
+
+    return {"rowsData":list_result,"all_num":all_num,"num_page":num_page,"page_num":page_num}
 
 def _fill_type_configuration(
         nodes,
@@ -936,64 +982,84 @@ def get_document(collection, document):
     # doc_path = path_join(real_dir, document)
     return _document_json_dict(document)
 
+def getChangedOfDocument(document):
+    docs = IEDocument.objects.filter(Q(human_identifier=document)).values('deleted','added')
+    _dict = {"deleted":0,"added":0}
+    if len(docs)>0:
+        _dict = docs[0]
+    return _dict
+
 doc_lock = RLock()
 
 def moveDocument(dir,document,request):
     user = request.user.username
     if int(dir)>0:
         #get the next of the user edit before
-        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) and Q(edituser=user)).order_by("-edittime")
+        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser=user)).order_by("-edittime").values('id','human_identifier')
         list_docs = list(docs)
         _index = None
         for i in range(len(list_docs)):
             _doc = list_docs[i]
-            if _doc.human_identifier==document:
+            if _doc['human_identifier']==document:
                 _index = i
                 break
         if _index is None or _index==0:
             #next
             #use lock to gain the next document
             with doc_lock:
-                docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser__isnull=True) & Q(jump_signal=0)).order_by("id")
+                docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser__isnull=True) & Q(jump_signal=0)).order_by("id").values('id','human_identifier')
                 if docs.count()>0:
-                    _doc = docs.first()
-                    new_document = _doc.human_identifier
-                    _doc.edittime = datetime.now()
-                    _doc.edituser = user
-                    _doc.save()
+                    update_flag = False
+                    for _doc in docs:
+                        new_document = _doc['human_identifier']
+
+                        with transaction.atomic():
+                            # conditional UPDATE acts as a compare-and-set:
+                            # it only succeeds if no other user has claimed
+                            # this document in the meantime
+                            updated = IEDocument.objects.filter(
+                                id=_doc["id"],
+                                edituser__isnull=True
+                            ).update(
+                                edituser=user,
+                                edittime=datetime.now()
+                            )
+                        if updated==0:
+                            # nothing was written, try the next candidate
+                            continue
+                        update_flag = True
+                        break
+                    if not update_flag:
+                        new_document = document
+                        Messager.info("数据标注完成")
                 else:
                     new_document = document
                     Messager.info("数据标注完成")
         else:
             _doc = list_docs[_index-1]
-            _doc.reedittime = datetime.now()
-            new_document = _doc.human_identifier
+            new_document = _doc['human_identifier']
 
     else:
         #prev
         #get the prev document of the current user
-        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) and Q(edituser=user) ).order_by("-edittime")
+        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser=user)).order_by("-edittime").values('id','human_identifier')
         if docs.count()>1:
             list_docs = list(docs)
             _index = None
             for i in range(len(list_docs)):
                 _doc = list_docs[i]
-                if _doc.human_identifier==document:
+                if _doc['human_identifier']==document:
                     _index = i
-                    _doc.reedittime = datetime.now()
-                    _doc.save()
                     break
             if _index is not None:
                 if  _index<len(list_docs)-1:
-                    new_document = list_docs[_index+1].human_identifier
+                    new_document = list_docs[_index+1]["human_identifier"]
                 else:
-                    new_document = list_docs[-1].human_identifier
+                    new_document = list_docs[-1]["human_identifier"]
                     Messager.info("已经是第一条数据")
             else:
                 if document=="":
-                    new_document = list_docs[0].human_identifier
+                    new_document = list_docs[0]["human_identifier"]
                 else:
-                    new_document = list_docs[1].human_identifier
+                    new_document = list_docs[1]["human_identifier"]
 
         else:
             new_document = ""
@@ -1002,13 +1068,21 @@ def moveDocument(dir,document,request):
 def searchLabel(request):
     user = request.user.username
     cursor = connection.cursor()
-    sql = " select human_identifier from corpus_iedocument where edituser='%s'order by edittime asc" %(user)
+    sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(user)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    end_time = rows[0][0]
+    and_sql = " and 1=1 "
+    if end_time is not None:
+        and_sql = " and to_char(edittime,'yyyy-mm-dd')>'%s' "%end_time
+
+    sql = " select human_identifier from corpus_iedocument where edituser='%s' %s order by edittime asc" %(user,and_sql)
     cursor.execute(sql)
     list_docid = []
     for row in cursor.fetchall():
         list_docid.append(row[0])
     set_first = set(list_docid[:1200])
-    sql = " select document_id,value from brat_bratannotation where document_id in(select human_identifier from corpus_iedocument where edituser='%s') "%(user)
+    sql = " select document_id,value from brat_bratannotation where document_id in(select human_identifier from corpus_iedocument where edituser='%s' %s) "%(user,and_sql)
     cursor.execute(sql)
     eleCount = 0
     relCount = 0
@@ -1033,7 +1107,7 @@ def searchLabel(request):
     return {"docCount":len(list_docid),"eleCount":eleCount,"relCount":relCount,"wage":round(wage*0.9,2)}
 
 
-def get_document_timestamp(collection, document):
+def get_document_timestamp(collection, document,request):
     directory = collection
     real_dir = real_directory(directory)
     assert_allowed_to_read(real_dir)

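Note: moveDocument now claims the next unassigned document with a conditional UPDATE instead of read-then-save, and getSelfLabel pages a user's own annotations with hand-built SQL. A minimal sketch of both ideas; the helper names try_claim/count_edited and the exact query text are illustrative, not part of the commit:

    from datetime import datetime
    from django.db import connection
    from iepy.webui.corpus.models import IEDocument

    def try_claim(doc_id, user):
        # The "edituser IS NULL" condition turns the UPDATE into a
        # compare-and-set: update() returns the number of rows it changed,
        # so only one concurrent annotator gets 1 back for a given document.
        return IEDocument.objects.filter(
            id=doc_id, edituser__isnull=True
        ).update(edituser=user, edittime=datetime.now()) == 1

    def count_edited(user, time_begin):
        # Parameterised variant of the interpolated SQL used in getSelfLabel;
        # letting the driver bind the values avoids SQL injection.
        with connection.cursor() as cursor:
            cursor.execute(
                "select count(1) from corpus_iedocument"
                " where edituser = %s and to_char(edittime, 'yyyy-mm-dd') >= %s",
                [user, time_begin],
            )
            return cursor.fetchone()[0]
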
+ 13 - 12
iepy/webui/brat/src/search.py

@@ -17,7 +17,8 @@ DEFAULT_EMPTY_STRING = "***"
 REPORT_SEARCH_TIMINGS = False
 DEFAULT_RE_FLAGS = re.UNICODE
 ###
-
+import logging
+logger = logging.getLogger(__name__)
 if REPORT_SEARCH_TIMINGS:
     from sys import stderr
     from datetime import datetime
@@ -172,20 +173,20 @@ def __filenames_to_annotations(filenames):
                 "").replace(
                 ".rel",
                 "")
-            ann_obj = annotation.TextAnnotations(nosuff_fn, read_only=True)
-            anns.append(ann_obj)
+            with annotation.TextAnnotations(nosuff_fn, read_only=True) as ann_obj:
+                anns.append(ann_obj)
         except annotation.AnnotationFileNotFoundError:
-            print("%s:\tFailed: file not found" % fn, file=sys.stderr)
+            logger.info("%s:\tFailed: file not found" % fn, file=sys.stderr)
         except annotation.AnnotationNotFoundError as e:
-            print("%s:\tFailed: %s" % (fn, e), file=sys.stderr)
+            logger.info("%s:\tFailed: %s" % (fn, e), file=sys.stderr)
 
     if len(anns) != len(filenames):
-        print("Note: only checking %d/%d given files" % (
+        logger.info("Note: only checking %d/%d given files" % (
             len(anns), len(filenames)), file=sys.stderr)
 
     if REPORT_SEARCH_TIMINGS:
         process_delta = datetime.now() - process_start
-        print("filenames_to_annotations: processed in", str(
+        logger.info("filenames_to_annotations: processed in", str(
             process_delta.seconds) + "." + str(process_delta.microseconds / 10000), "seconds", file=stderr)
 
     return anns
@@ -210,13 +211,13 @@ def __document_to_annotations(directory, document):
     """Given a directory and a document, returns an Annotations object for the
     file."""
     # TODO: put this shared functionality in a more reasonable place
-    from document import real_directory
-    from os.path import join as path_join
+    # from document import real_directory
+    # from os.path import join as path_join
 
-    real_dir = real_directory(directory)
-    filenames = [path_join(real_dir, document)]
+    # real_dir = real_directory(directory)
+    # filenames = [path_join(real_dir, document)]
 
-    return __filenames_to_annotations(filenames)
+    return __filenames_to_annotations([document])
 
 
 def __doc_or_dir_to_annotations(directory, document, scope):

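Note: the print(..., file=stderr) diagnostics above were switched to a module-level logger, so they only show up once logging is configured somewhere in the project. A standalone sketch of a setup that keeps the old stderr behaviour (the format string and logger name are assumptions, not part of the commit):

    import logging

    # basicConfig attaches a StreamHandler that writes to stderr by default
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )
    logger = logging.getLogger("brat.search")
    logger.info("Note: only checking %d/%d given files", 3, 5)
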
+ 9 - 0
iepy/webui/brat/static/client/src/annotator_ui.js

@@ -1362,6 +1362,8 @@ var AnnotatorUI = (function($, window, undefined) {
 
         // TODO: sorting on click on header (see showFileBrowser())
       }
+
+
       var performNormSearch = function() {
         var val = $('#norm_search_query').val();
         var db = $('#span_norm_db').val();
@@ -2451,6 +2453,7 @@ var AnnotatorUI = (function($, window, undefined) {
         dispatcher.post('ajax', [spanOptions, 'edited']);
         dispatcher.post('hideForm');
         $('#waiter').dialog('open');
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var reselectSpan = function() {
@@ -2459,6 +2462,7 @@ var AnnotatorUI = (function($, window, undefined) {
         $(editedSpan.rect).addClass('reselect');
         reselectedSpan = editedSpan;
         selectedFragment = null;
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var splitForm = $('#split_form');
@@ -2504,6 +2508,7 @@ var AnnotatorUI = (function($, window, undefined) {
         var $roleButtons = $roles.find('input').button();
         
         dispatcher.post('showForm', [splitForm]);
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var addFragment = function() {
@@ -2512,6 +2517,7 @@ var AnnotatorUI = (function($, window, undefined) {
         $(editedSpan.rect).addClass('reselect');
         reselectedSpan = editedSpan;
         selectedFragment = false;
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var reselectFragment = function() {
@@ -2540,6 +2546,7 @@ var AnnotatorUI = (function($, window, undefined) {
         dispatcher.post('ajax', [spanOptions, 'edited']);
         dispatcher.post('hideForm');
         $('#waiter').dialog('open');
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var spanChangeLock = function(evt) {
@@ -2599,6 +2606,7 @@ var AnnotatorUI = (function($, window, undefined) {
             });
             $lock.click(spanChangeLock);
             $($span).buttonset();
+            dispatcher.post('getChangedOfDocument',[doc]);
           },
           beforeClose: function(evt) {
             // in case the form is cancelled
@@ -2671,6 +2679,7 @@ var AnnotatorUI = (function($, window, undefined) {
 
         $('#waiter').dialog('open');
         dispatcher.post('ajax', [spanOptions, 'edited']);
+        dispatcher.post('getChangedOfDocument',[doc]);
         return false;
       };
       $('#span_notes').focus(function () {

+ 51 - 0
iepy/webui/brat/static/client/src/custom.js

@@ -0,0 +1,51 @@
+var customer = (function($, window, undefined) {
+    var customer = function(dispatcher, svgId) {
+
+        function clickDocument(document){
+            dispatcher.post('ajax', [{
+                action: 'getDocument',
+                collection: "/",
+                'document': document,
+            }, 'renderData', {
+                collection: "/",
+                'document': document
+            }]);
+        }
+        var receiveSelfLabel = function(data){
+            var list_header = ["id","document_id","edittime","deleted","added"]
+            var table_str = '<table class="ui-widget-content" style="width:100%">';
+            table_str += "<tr>";
+            for (var j in list_header){
+                table_str += "<td>"+list_header[j]+"</td>"
+            }
+            table_str += "</tr>";
+            for(var i =0;i<data.rowsData.length;i++){
+                table_str += "<tr>";
+                var _row = data.rowsData[i];
+                for (var j in list_header){
+                    var _head = list_header[j]
+                    var _td = "<td>"
+                    if(_head=="document_id"){
+                        _td = "<td >"
+                        table_str += _td+"<input onclick='dispatcher.post(\"setDocument\",[this.value,this.value])' class='fullwidth' style='background: #5c9ccc' type='button' value='"+_row[list_header[j]].toString()+"'/>"+"</td>";
+                    }else{
+                        table_str += _td+_row[list_header[j]].toString()+"</td>";
+                    }
+
+                }
+                table_str += "</tr>";
+            }
+            table_str +="</table>";
+            $('#selfLabel_content').html(table_str);
+            $('#self_whole_count').val(data.all_num);
+            $('#self_whole_page').val(data.num_page);
+            $('#page_num').val(data.page_num);
+        }
+
+        dispatcher.
+            on('receiveSelfLabel', receiveSelfLabel).
+            on('clickDocument', clickDocument);
+    };
+
+    return customer;
+})(jQuery, window);

+ 1 - 0
iepy/webui/brat/static/client/src/url_monitor.js

@@ -59,6 +59,7 @@ var URLMonitor = (function($, window, undefined) {
           dispatcher.post('docChanged', [doc, oldDoc]);
         }
         setArguments(args || null);
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var setCollection = function(coll, doc, args) {

+ 53 - 9
iepy/webui/brat/static/client/src/visualizer.js

@@ -1248,12 +1248,12 @@ var Visualizer = (function($, window, undefined) {
               var lastChar = fragment.to - fragment.chunk.from - 1;
 
               // Adjust for XML whitespace (#832, #1009)
-              var textUpToFirstChar = fragment.chunk.text.substring(0, firstChar);
-              var textUpToLastChar = fragment.chunk.text.substring(0, lastChar);
-              var textUpToFirstCharUnspaced = textUpToFirstChar.replace(/\s\s+/g, ' ');
-              var textUpToLastCharUnspaced = textUpToLastChar.replace(/\s\s+/g, ' ');
-              firstChar -= textUpToFirstChar.length - textUpToFirstCharUnspaced.length;
-              lastChar -= textUpToLastChar.length - textUpToLastCharUnspaced.length;
+              // var textUpToFirstChar = fragment.chunk.text.substring(0, firstChar);
+              // var textUpToLastChar = fragment.chunk.text.substring(0, lastChar);
+              // var textUpToFirstCharUnspaced = textUpToFirstChar.replace(/\s\s+/g, ' ');
+              // var textUpToLastCharUnspaced = textUpToLastChar.replace(/\s\s+/g, ' ');
+              // firstChar -= textUpToFirstChar.length - textUpToFirstCharUnspaced.length;
+              // lastChar -= textUpToLastChar.length - textUpToLastCharUnspaced.length;
               
               var startPos, endPos;
               if (firstChar < fragment.chunk.text.length) {
@@ -1276,9 +1276,10 @@ var Visualizer = (function($, window, undefined) {
                 to: Math.max(startPos, endPos)
               };
             } else { // it's markedText [id, start?, char#, offset, kind]
-              var textUpToChar = text.textContent.substring(0, fragment[2]);
-              var textUpToCharUnspaced = textUpToChar.replace(/\s\s+/g, ' ');
-              var pos = fragment[2] - (textUpToChar.length - textUpToCharUnspaced.length);
+              // var textUpToChar = text.textContent.substring(0, fragment[2]);
+              // var textUpToCharUnspaced = textUpToChar.replace(/\s\s+/g, ' ');
+              // var pos = fragment[2] - (textUpToChar.length - textUpToCharUnspaced.length);
+              var pos = fragment[2]
               if (pos < 0) pos = 0;
               if (!pos) { // start
                 fragment[3] = text.getStartPositionOfChar(pos).x;
@@ -3333,6 +3334,47 @@ Util.profileStart('before render');
         }
       };
 
+      function clickDocument(document){
+        dispatcher.post('ajax', [{
+          action: 'getDocument',
+          collection: "/",
+          'document': document,
+        }, 'renderData', {
+          collection: "/",
+          'document': document
+        }]);
+      }
+      var receiveSelfLabel = function(data){
+        var list_header = ["id","document_id","edittime","deleted","added"]
+        var table_str = '<table class="ui-widget-content" style="width:100%">';
+        table_str += "<tr>";
+        for (var j in list_header){
+          table_str += "<td>"+list_header[j]+"</td>"
+        }
+        table_str += "</tr>";
+        for(var i =0;i<data.rowsData.length;i++){
+          table_str += "<tr>";
+          var _row = data.rowsData[i];
+          for (var j in list_header){
+            var _head = list_header[j]
+            var _td = "<td>"
+            if(_head=="document_id"){
+              _td = "<td >"
+              table_str += _td+"<input onclick='dispatcher.post(\"setDocument\",[this.value,this.value])' class='fullwidth' style='background: #5c9ccc' type='button' value='"+_row[list_header[j]].toString()+"'/>"+"</td>";
+            }else{
+              table_str += _td+_row[list_header[j]].toString()+"</td>";
+            }
+
+          }
+          table_str += "</tr>";
+        }
+        table_str +="</table>";
+        $('#selfLabel_content').html(table_str);
+        $('#self_whole_count').val(data.all_num);
+        $('#self_whole_page').val(data.num_page);
+        $('#page_num').val(data.page_num);
+      }
+
       var setAbbrevs = function(_abbrevsOn) {
         // TODO: this is a slightly weird place to tweak the configuration
         Configuration.abbrevsOn = _abbrevsOn;
@@ -3542,6 +3584,8 @@ Util.profileStart('before render');
           on('textBackgrounds', setTextBackgrounds).
           on('layoutDensity', setLayoutDensity).
           on('svgWidth', setSvgWidth).
+          on('receiveSelfLabel', receiveSelfLabel).
+          on('clickDocument', clickDocument).
           on('current', gotCurrent).
           on('clearSVG', clearSVG).
           on('mouseover', onMouseOver).

+ 123 - 2
iepy/webui/brat/static/client/src/visualizer_ui.js

@@ -921,6 +921,10 @@ var VisualizerUI = (function($, window, undefined) {
         dispatcher.post('clearSearch');
       });
 
+      $('#searchSelfLabel').click(function(evt){
+        dispatcher.post('showForm',[selfLabel_form]);
+      });
+
       $('#searchLabel').click(function(evt){
         dispatcher.post('searchLabel');
       });
@@ -1554,7 +1558,8 @@ var VisualizerUI = (function($, window, undefined) {
 
       var receiveDocument = function(data){
         dispatcher.post('setDocument', [data.doc,data.doc]);
-        dispatcher.post('searchLabel');
+        // dispatcher.post('searchLabel');
+        // dispatcher.post('getChangedOfDocument');
       }
       var moveDocument = function(dir){
         dispatcher.post('allowReloadByURL')
@@ -1569,11 +1574,110 @@ var VisualizerUI = (function($, window, undefined) {
         }]);
       };
 
+      var wage_form = $('#wage_form')
+
+      dispatcher.post(initForm, [wage_form, {
+        no_ok:true,
+        no_cancel:false,
+        resizable: false,
+        width: 400,
+        open: function(evt) {
+              keymap = {};
+        },
+        close: function(evt) {
+          keymap = {};
+        }
+      }]);
+
+      var selfLabel_form = $('#selfLabel_form')
+
+      dispatcher.post(initForm, [selfLabel_form, {
+        no_ok:true,
+        no_cancel:true,
+        resizable: false,
+        width: 700,
+        height:630,
+        x:100,
+        y:0,
+        open: function(evt) {
+          keymap = {};
+        },
+        close: function(evt) {
+          keymap = {};
+        }
+      }]);
+      
+      $('#to_search').click(function(){
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":$('#page_num').val()
+        },'receiveSelfLabel',{
+          
+        }]);
+      })
+
+      $('#self_turn').click(function(){
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":$('#page_num').val()
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+      $('#self_last_page').click(function(){
+        var page_num = $('#page_num').val();
+        if(page_num!=""){
+          page_num = parseInt(page_num);
+          page_num -= 1;
+        }
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":page_num,
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+
+
+      $('#self_next_page').click(function(){
+        var page_num = $('#page_num').val();
+        if(page_num!=""){
+          page_num = parseInt(page_num);
+          page_num += 1;
+        }
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":page_num,
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+      $('#search_begin').datepicker({
+        dateFormat: 'yy-mm-dd',
+        inline: true
+      });
+      $('#search_end').datepicker({
+        dateFormat: 'yy-mm-dd',
+        inline: true
+      });
+
       var receiveLabel = function(data){
         $('#docCount').val(data.docCount);
         $('#eleCount').val(data.eleCount);
         $('#relCount').val(data.relCount);
         $('#wage').val(Math.round(data.wage,2));
+        dispatcher.post('showForm',[wage_form])
       };
       var searchLabel = function(){
         dispatcher.post('ajax',[{
@@ -1583,6 +1687,19 @@ var VisualizerUI = (function($, window, undefined) {
         }]);
       };
 
+      var receiveChangedOfD = function(data){
+        $('#document_deleted').val(data.deleted);
+        $('#document_added').val(data["added"])
+      };
+
+      var getChangedOfDocument = function(_doc){
+        dispatcher.post('ajax',[{
+          action:'getChangedOfDocument',
+          'document':_doc
+        },'receiveChangedOfD',{
+
+        }]);
+      };
 
 
       var moveInFileBrowser = function(dir) {
@@ -1824,7 +1941,7 @@ var VisualizerUI = (function($, window, undefined) {
           $cmpLink.button();
         }
           
-        $docName = $('#document_name input').val(coll + doc);
+        $docName = $('#document_id').val(coll + doc);
         var docName = $docName[0];
         // TODO do this on resize, as well
         // scroll the document name to the right, so the name is visible
@@ -1962,6 +2079,8 @@ var VisualizerUI = (function($, window, undefined) {
         showForm(aboutDialog);
       });
 
+
+
       // TODO: copy from annotator_ui; DRY it up
       var adjustFormToCursor = function(evt, element) {
         var screenHeight = $(window).height() - 8; // TODO HACK - no idea why -8 is needed
@@ -2408,7 +2527,9 @@ var VisualizerUI = (function($, window, undefined) {
           on('searchResultsReceived', searchResultsReceived).
           on('clearSearch', clearSearch).
           on('searchLabel',searchLabel).
+          on('getChangedOfDocument',getChangedOfDocument).
           on('receiveLabel',receiveLabel).
+          on('receiveChangedOfD',receiveChangedOfD).
           on('clearSVG', showNoDocMessage).
           on('screamingHalt', onScreamingHalt).
           on('configurationChanged', configurationChanged).

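Note: the selfLabel dialog added above pages through the current annotator's documents via page_num, while the server side (getSelfLabel in document.py) fixes the page size at 20 and rounds the page count up. A small standalone sketch of that paging arithmetic (page_window is an invented helper name):

    def page_window(all_num, page_num, limit=20):
        # 1-based page numbers, 20 rows per page, page count rounded up;
        # mirrors the _offset/num_page computation in getSelfLabel.
        page_num = max(int(page_num or 1), 1)
        offset = limit * (page_num - 1)
        num_page = all_num // limit + (1 if all_num % limit else 0)
        return offset, num_page

    # e.g. 37 matching documents, viewing page 2 -> skip 20 rows, 2 pages total
    assert page_window(37, 2) == (20, 2)
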
+ 1 - 1
iepy/webui/brat/static/style-ui.css

@@ -303,7 +303,7 @@ div.scroll_wrapper_upper div.scroller {
   margin-right: 100px;
 }
 #document_name input {
-  width: 100%;
+  width: 20%;
   border: none;
 }
 

+ 114 - 31
iepy/webui/brat/templates/brat/index.html

@@ -34,6 +34,7 @@ head.js(
         "{% static 'client/src/visualizer_ui.js' %}",
         "{% static 'client/src/annotator_ui.js' %}",
         "{% static 'client/src/spinner.js' %}",
+    "{% static 'client/src/custom.js' %}",
 );
 head.ready(function() {
   // var dispatcher = new Dispatcher(); // XXX DEBUG
@@ -45,6 +46,7 @@ head.ready(function() {
   var visualizerUI = new VisualizerUI(dispatcher, svg);
   var annotatorUI = new AnnotatorUI(dispatcher, svg);
   var spinner = new Spinner(dispatcher, '#spinner');
+  var customUI = new customer(dispatcher, svg); // local renamed so it does not shadow the customer module defined in custom.js
   var logger = new AnnotationLog(dispatcher);
   // Util.profileEnable();
   dispatcher.post('init');
@@ -116,8 +118,13 @@ njsw.parentNode.removeChild(njsw);
           <img id="prev" alt="Previous document (Cursor Left)" title="Previous document (Cursor Left)" src="{% static 'img/arrow-180.png' %}"/><img id="next" alt="Next document (Cursor Right)" title="Next document (Cursor Right)" src="{% static 'img/arrow.png' %}"/>
         </span>
   <div id="document_name">
-    <input readonly="readonly" class="ui-widget-header"></input>
+    <input id='document_id' readonly="readonly" class="ui-widget-header"></input>
+          deleted:
+          <input id='document_deleted' readonly="readonly" class="ui-widget-header"></input>
+          added:
+          <input id='document_added' readonly="readonly" class="ui-widget-header"></input>
   </div>
+
   <!-- <span id="document_ctime"/> -->
 </div>
 <div id="pulldown" class="unselectable ui-widget-content">
@@ -126,22 +133,17 @@ njsw.parentNode.removeChild(njsw);
 
 <!--      <input id="auth_button" type="button" value="Login"/>-->
         <input id="manage" type="button" value="manage" onclick="window.open(url='/admin')"/>
-
-        <input id="searchLabel" type="button" value="searchLabel"/>
-        <input type="text" value="文章数" disabled="disabled" style="width:10%"/>
-        <input id="docCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="要素数"  disabled="disabled" style="width:10%"/>
-        <input id="eleCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="关系数" disabled="disabled" style="width:10%"/>
-        <input id="relCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="合格结算价" disabled="disabled" style="width:10%"/>
-        <input id="wage" type="text" disabled="disabled" style="width:10%"/>
+        <input id="searchSelfLabel" type="button" value="search"/>
+<!--        <input id="search_button" type="checkbox" class="login" value="true" tabindex="-1"/>-->
+<!--        <label id="search_button_label" for="search_button" title="Search text and annotations">Search</label>-->
+<!--        <input id="clear_search_button" type="button" value="✕" tabindex="-1" title="Clear search" style="display: none"/>-->
+        <input id="searchLabel" type="button" value="wage"/>
 <!--      {#            <input id="collection_browser_button" type="button" value="Collection" tabindex="-1" title="Open Collection Browser (Tab)"/>#}-->
 <!--      {#            <input id="data_button" type="button" value="Data" tabindex="-1" title="Import, Export Data; Manage Collection"/>#}-->
-<!--      {#            <input id="search_button" type="checkbox" class="login" value="true" tabindex="-1"/><label id="search_button_label" for="search_button" title="Search text and annotations">Search</label><input id="clear_search_button" type="button" value="✕" tabindex="-1" title="Clear search" style="display: none"/>#}-->
-<!--      {#            <input id="unlock_type_button" type="button" value="Unlock" tabindex="-1" title="Stop annotating with the locked type"/>#}-->
+
+      <input id="unlock_type_button" type="button" value="Unlock" title="Stop annotating with the locked type"/>
       <!--<input id="undo_button" type="button" class="login" value="Undo" tabindex="-1"/>-->
-<!--      {#            <input id="options_button" type="button" value="Options" tabindex="-1" title="Set Visual, Annotation and Network Options"/>#}-->
+      <input id="options_button" type="button" value="Options" tabindex="-1" title="Set Visual, Annotation and Network Options"/>
     </div>
     <!-- Dummy span, fixes visual glitch (issue #535). TODO: better fix -->
     <span class="document_edit_time unselectable">&nbsp;</span>
@@ -303,28 +305,28 @@ njsw.parentNode.removeChild(njsw);
   </div>
   <div class="optionRow">
     <span class="optionLabel">Visualization width</span>
-    <input id="svg_width_value" maxlength="3" size="3" value="100"
+    <input id="svg_width_value" maxlength="3" size="3" value="100" onblur="if(this.value=='' || parseInt(this.value)<20){this.value='20';}"
            style="text-align:right"/>
     <span id="svg_width_unit" class="radio_group small-buttons">
             <input type="radio" id="svg_width_unit_percent" value="%"
                    name="svg_width_radio" checked="checked"/>
             <label for="svg_width_unit_percent">percent</label>
-            <input type="radio" id="svg_width_unit_pixels" value="px"
-                   name="svg_width_radio"/>
-            <label for="svg_width_unit_pixels">pixels</label>
+<!--            <input type="radio" id="svg_width_unit_pixels" value="px"-->
+<!--                   name="svg_width_radio"/>-->
+<!--            <label for="svg_width_unit_pixels">pixels</label>-->
           </span>
   </div>
-  <div class="optionRow small-buttons">
-    <span class="optionLabel">Paging</span>
-    size
-    <input id="paging_size" maxlength="3" size="3" value="10"
-           style="text-align:right"/>,
-    step
-    <input id="paging_step" maxlength="3" size="3" value="5"
-           style="text-align:right"/>
-    sentences
-    <input type="button" id="paging_clear" value="Clear"/>
-  </div>
+<!--  <div class="optionRow small-buttons">-->
+<!--    <span class="optionLabel">Paging</span>-->
+<!--    size-->
+<!--    <input id="paging_size" maxlength="3" size="3" value="10"-->
+<!--           style="text-align:right"/>,-->
+<!--    step-->
+<!--    <input id="paging_step" maxlength="3" size="3" value="5"-->
+<!--           style="text-align:right"/>-->
+<!--    sentences-->
+<!--    <input type="button" id="paging_clear" value="Clear"/>-->
+<!--  </div>-->
 </fieldset>
 <fieldset id="options_form_annotation" class="login">
   <legend>Annotation options</legend>
@@ -352,7 +354,7 @@ njsw.parentNode.removeChild(njsw);
            style="text-align:right"/>
   </div>
 </fieldset>
-<fieldset id="options_form_network">
+<fieldset id="options_form_network" style="display:none">
   <legend>Network options</legend>
   <div class="optionRow">
     <span class="optionLabel">Collaboration</span>
@@ -369,6 +371,87 @@ njsw.parentNode.removeChild(njsw);
 <fieldset>
   <textarea id="more_info_readme" readonly="readonly" class="borderless"></textarea>
 </fieldset>
+</form>
+
+<form id="wage_form" class="dialog" title="wage">
+    <div>
+        <table class="fullwidth">
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="文章数" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="docCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="要素数"  disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="eleCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr >
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="关系数" disabled="disabled"  style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="relCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+            <tr >
+                <td >
+                    <input type="text" class="fullwidth" value="合格结算价" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td >
+                    <input id="wage" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+        </table>
+    </div>
+</form>
+
+
+<form id="selfLabel_form" class="dialog" title="selfLabel">
+    <div class="fullwidth" >
+        <table class="ui-widget-header" style="width:100%">
+            <tr>
+                <td >
+                    <input type="text" class="fullwidth" value="开始时间" readonly="readonly" style="background: #5c9ccc"/>
+                </td>
+                <td >
+                    <input id="search_begin" type="text"/>
+                </td >
+                <td >
+                    <input type="text" class="fullwidth" value="结束时间" readonly="readonly" style="background: #5c9ccc"/>
+                </td >
+                <td >
+                    <input id="search_end" type="text"/>
+                </td>
+                <td >
+                    <input id="to_search" type="button" class="fullwidth" value="查询"/>
+                </td>
+            </tr>
+        </table>
+    </div>
+    <div id="selfLabel_content" class="fullwidth" style="height:500px;overflow: scroll">
+
+    </div>
+    <div class="ui-widget-header" style="width:100%">
+        <tr>
+            <td style="width:60%">
+                记录共计<input id="self_whole_count" readonly="readonly" size="5"/>条,
+                <input id="self_whole_page" readonly="readonly" size="5"/>页
+            </td>
+            <td style="width:40%">
+                <input id="self_last_page" type="button" value="<"/>
+                <input id="page_num" type="number" size="5"/>
+                <input id="self_next_page" type="button" value=">"/>
+                <input id="self_turn" type="button" value="转到"/>
+            </td>
+        </tr>
+      </table>
+    </div>
 </form>
         <!-- Search dialog -->
 <form id="search_form" class="dialog" title="Search">
@@ -565,7 +648,7 @@ njsw.parentNode.removeChild(njsw);
           </span>
     </div>
     <div id="context_size_div" class="optionRow">
-      <span class="optionLabel" style="margin-left:1em;">Context length</span> <input id="context_length" maxlength="3" size="3" value="50"/> characters
+      <span class="optionLabel" style="margin-left:1em;">Context length</span> <input id="context_length" maxlength="3" minlenght="2" size="3" value="50"/> characters
     </div>
     <div class="optionRow">
       <span class="optionLabel">Match text as</span>

+ 2 - 2
iepy/webui/brat/views.py

@@ -87,8 +87,8 @@ def ajax_dispatch(request):
         from sys import path as sys_path
         log_critical('Heisenbug trap reports: ' + str(sys_path))
         raise
-    client_ip = request.META["REMOTE_ADDR"]
-    client_hostname = request.META["REMOTE_HOST"]
+    client_ip = request.META.get("REMOTE_ADDR",'')
+    client_hostname = request.META.get("REMOTE_HOST",'')
     # init_session(client_ip, cookie_data=cookie_data)
     # init_session_iepy(request)
     response_is_JSON = True

Some files were not shown because too many files changed in this commit