Browse code

Product field update + annotation system overhaul

rogel 4 years ago
Parent
Commit
793ad3a456
65 changed files with 2006 additions and 407 deletions
  1. 0 0
      examples/__init__.py
  2. 0 0
      examples/coreline/__init__.py
  3. 0 0
      examples/coreline/annotation.conf
  4. 0 0
      examples/coreline/articles.csv
  5. 0 0
      examples/coreline/bin/2020-08-01-2020-08-31要素标注统计.xls
  6. 0 0
      examples/coreline/bin/None-2020-09-25要素标注统计.xls
  7. 0 0
      examples/coreline/bin/None-2020-10-31要素标注统计.xls
  8. 0 0
      examples/coreline/bin/None-2020-11-25要素标注统计.xls
  9. 0 0
      examples/coreline/bin/None-2020-12-25要素标注统计.xls
  10. 0 0
      examples/coreline/bin/__init__.py
  11. 0 0
      examples/coreline/bin/csv_to_iepy.py
  12. 0 0
      examples/coreline/bin/gazettes_loader.py
  13. 0 0
      examples/coreline/bin/iepy_rules_runner.py
  14. 0 0
      examples/coreline/bin/iepy_runner.py
  15. 0 0
      examples/coreline/bin/manage.py
  16. 0 0
      examples/coreline/bin/preprocess.py
  17. 0 0
      examples/coreline/bin/rules_verifier.py
  18. 41 4
      examples/coreline/bin/settlement.py
  19. 0 0
      examples/coreline/extractor_config.json
  20. 20 0
      examples/coreline/gunicorn_django.py
  21. 0 0
      examples/coreline/rules.py
  22. 7 0
      examples/coreline/settings.py
  23. 0 0
      examples/coreline/test.sqlite
  24. 7 0
      examples/coreline/wsgi.py
  25. 69 0
      examples/nginx.conf
  26. 5 0
      examples/start.md
  27. Binary
      examples/test/bin/分组_1.xls
  28. Binary
      examples/test/bin/分组_10.xls
  29. Binary
      examples/test/bin/分组_2.xls
  30. Binary
      examples/test/bin/分组_3.xls
  31. Binary
      examples/test/bin/分组_4.xls
  32. Binary
      examples/test/bin/分组_5.xls
  33. Binary
      examples/test/bin/分组_6.xls
  34. Binary
      examples/test/bin/分组_7.xls
  35. Binary
      examples/test/bin/分组_8.xls
  36. Binary
      examples/test/bin/分组_9.xls
  37. 13 8
      iepy/data/models.py
  38. Binary
      iepy/selfpreprocess/BiddingKG/dl/complaint/models/punish_code.pb
  39. 473 0
      iepy/selfpreprocess/BiddingKG/dl/complaint/punish_predictor.py
  40. 98 70
      iepy/selfpreprocess/BiddingKG/dl/complaint/punish_rule.py
  41. 244 215
      iepy/selfpreprocess/BiddingKG/dl/interface/Preprocessing.py
  42. 3 0
      iepy/selfpreprocess/BiddingKG/dl/interface/getAttributes.py
  43. 94 8
      iepy/selfpreprocess/BiddingKG/dl/interface/predictor.py
  44. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/product_savedmodel/product.pb
  45. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/saved_model.pb
  46. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001
  47. Binary
      iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.index
  48. 0 0
      iepy/selfpreprocess/BiddingKG/dl/product/__init__.py
  49. 155 0
      iepy/selfpreprocess/BiddingKG/dl/product/data_util.py
  50. 68 4
      iepy/selfpreprocess/self_preprocess.py
  51. 159 0
      iepy/selfpreprocess/test4.py
  52. 1 1
      iepy/webui/brat/models.py
  53. 83 14
      iepy/webui/brat/src/annotation.py
  54. 0 3
      iepy/webui/brat/src/config.py
  55. 3 1
      iepy/webui/brat/src/dispatch.py
  56. 96 22
      iepy/webui/brat/src/document.py
  57. 13 12
      iepy/webui/brat/src/search.py
  58. 9 0
      iepy/webui/brat/static/client/src/annotator_ui.js
  59. 51 0
      iepy/webui/brat/static/client/src/custom.js
  60. 1 0
      iepy/webui/brat/static/client/src/url_monitor.js
  61. 53 9
      iepy/webui/brat/static/client/src/visualizer.js
  62. 123 2
      iepy/webui/brat/static/client/src/visualizer_ui.js
  63. 1 1
      iepy/webui/brat/static/style-ui.css
  64. 114 31
      iepy/webui/brat/templates/brat/index.html
  65. 2 2
      iepy/webui/brat/views.py

+ 0 - 0
examples/__init__.py


+ 0 - 0
examples/test/__init__.py → examples/coreline/__init__.py


+ 0 - 0
examples/test/annotation.conf → examples/coreline/annotation.conf


+ 0 - 0
examples/test/articles.csv → examples/coreline/articles.csv


+ 0 - 0
examples/test/bin/2020-08-01-2020-08-31要素标注统计.xls → examples/coreline/bin/2020-08-01-2020-08-31要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-09-25要素标注统计.xls → examples/coreline/bin/None-2020-09-25要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-10-31要素标注统计.xls → examples/coreline/bin/None-2020-10-31要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-11-25要素标注统计.xls → examples/coreline/bin/None-2020-11-25要素标注统计.xls


+ 0 - 0
examples/test/bin/None-2020-12-25要素标注统计.xls → examples/coreline/bin/None-2020-12-25要素标注统计.xls


+ 0 - 0
examples/coreline/bin/__init__.py


+ 0 - 0
examples/test/bin/csv_to_iepy.py → examples/coreline/bin/csv_to_iepy.py


+ 0 - 0
examples/test/bin/gazettes_loader.py → examples/coreline/bin/gazettes_loader.py


+ 0 - 0
examples/test/bin/iepy_rules_runner.py → examples/coreline/bin/iepy_rules_runner.py


+ 0 - 0
examples/test/bin/iepy_runner.py → examples/coreline/bin/iepy_runner.py


+ 0 - 0
examples/test/bin/manage.py → examples/coreline/bin/manage.py


+ 0 - 0
examples/test/bin/preprocess.py → examples/coreline/bin/preprocess.py


+ 0 - 0
examples/test/bin/rules_verifier.py → examples/coreline/bin/rules_verifier.py


+ 41 - 4
examples/test/bin/settlement.py → examples/coreline/bin/settlement.py

@@ -243,9 +243,46 @@ class Settlement():
 if __name__=="__main__":
     settle = Settlement()
     # settle.makeMigrate("test","2020-08-01","2020-08-31")
-    settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
-    # settle.makePayrolls("2020-08-01","2020-08-31")
-    settle.exportPayroll(begin_time=None,end_time='2020-12-25')
+    # settle.makePayroll(["test3","test19","test22","test2","test9","test11","test12","test1","test7","test21","test17"],"2020-08-01","2020-12-25")
+    # settle.exportPayroll(begin_time=None,end_time='2020-12-25')
     # settle.createUser_batch(batch_size=102)
     # settle.exportLabels()
-    # settle.filter()
+    # settle.filter()
+
+    from brat.models import BratAnnotation as brat_annotations
+    from iepy.webui.corpus.models import IEDocument
+    filename = '74800260'
+    pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label")
+    _label = ""
+    if len(pre_label)>0:
+        if pre_label[0]["pre_label"] is not None:
+            _label = pre_label[0]["pre_label"]
+    dict_T = dict()
+    dict_R = dict()
+    set_pre_label = set()
+    for _i in _label.split(";"):
+        if _i!="":
+            set_pre_label.add(_i)
+    set_label = set()
+    anns = brat_annotations.objects.filter(document_id='74800260').values("value")
+    for _str in anns:
+        _str = _str["value"].strip()
+        if _str != "":
+            if _str[0]=="T":
+                match = re.search(T_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+            if _str[0]=="R":
+                match = re.search(R_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+    for _T,_v in dict_T.items():
+        set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+    for _R,_v in dict_R.items():
+        set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+    union_set = set_pre_label&set_label
+    deleted = len(set_pre_label)-len(union_set)
+    added = len(set_label)-len(union_set)
+    print(deleted,added)

+ 0 - 0
examples/test/extractor_config.json → examples/coreline/extractor_config.json


+ 20 - 0
examples/coreline/gunicorn_django.py

@@ -0,0 +1,20 @@
+
+
+# gunicorn_config.py
+import logging
+import logging.handlers
+from logging.handlers import WatchedFileHandler
+import os
+import multiprocessing
+bind = '127.0.0.1:8001'      # bind address (IP and port)
+backlog = 512                # listen backlog
+# chdir = '/label/iepy-develop/examples/coreline/bin'  # working directory gunicorn should chdir into
+timeout = 30      # worker timeout
+worker_class = 'gevent' # use the gevent worker class; 'sync' (the default) is also available
+
+workers = multiprocessing.cpu_count() * 2 + 1    # number of worker processes
+threads = 2 # threads per worker
+loglevel = 'info' # log level; this applies to the error log, the access log level cannot be configured
+access_log_format = '%(t)s %(p)s %(h)s "%(r)s" %(s)s %(L)s %(b)s %(f)s" "%(a)s"'
+accesslog = "/label/iepy-develop/examples/coreline/log/gunicorn_access.log"      # access log file
+errorlog = "/label/iepy-develop/examples/coreline/log/gunicorn_error.log"        # error log file

+ 0 - 0
examples/test/rules.py → examples/coreline/rules.py


+ 7 - 0
examples/test/settings.py → examples/coreline/settings.py

@@ -15,6 +15,11 @@ SECRET_KEY = 'u==!fueit=wxo&j8!5u+sfasp4prjluk@*s=7!-wz_&r@pn))r'
 DEBUG = True
 TEMPLATE_DEBUG = True
 
+STATIC_URL = '/static/'
+
+# BASE_DIR is the absolute path of the project
+STATIC_ROOT = os.path.join(BASE_DIR, 'static')
+
 # Database
 # https://docs.djangoproject.com/en/1.7/ref/settings/#databases
 # DATABASES = {
@@ -54,6 +59,7 @@ Disallow: /confidential/
 """,
     "annotation.conf":"""
 [spans]
+product
 code
 name
 money
@@ -158,6 +164,7 @@ time_bidclose | 截标时间
 moneysource | 资金来源
 bidway | 招标方式
 serviceTime | 服务期限
+product | 产品
 
 #Protein | Protein | Pro | P
 #Protein_binding | Protein binding | Binding | Bind

+ 0 - 0
examples/test/test.sqlite → examples/coreline/test.sqlite


+ 7 - 0
examples/coreline/wsgi.py

@@ -0,0 +1,7 @@
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "coreline.settings")
+
+application = get_wsgi_application()

+ 69 - 0
examples/nginx.conf

@@ -0,0 +1,69 @@
+
+#user  nobody;
+worker_processes  10;
+
+#error_log  logs/error.log;
+#error_log  logs/error.log  notice;
+#error_log  logs/error.log  info;
+
+#pid        logs/nginx.pid;
+
+
+events {
+    worker_connections  1024;
+}
+
+
+http {
+    include      mime.types;
+    default_type  application/octet-stream;
+
+    log_format  access   '{"ip":"$remote_addr", "time":"[$time_iso8601]" ,"path":"$request","infosoruce":"$http_user_agent"}';
+
+    #access_log  logs/ip.log access;
+
+    sendfile        on;
+    #tcp_nopush     on;
+
+    #keepalive_timeout  0;
+    keepalive_timeout  65;
+
+    gzip  on;
+
+
+    client_max_body_size 8M;
+    client_body_buffer_size 128k;
+
+	upstream mysvr {   
+	  server 127.0.0.1:8002;
+	}
+
+    server {
+		# port and server name
+		listen 8000;
+		server_name 127.0.0.1;
+
+		# log files
+		access_log /label/iepy-develop/examples/coreline/log/gunicorn_access.log;
+		error_log /label/iepy-develop/examples/coreline/log/gunicorn_error.log;
+
+		# don't log errors for missing favicon.ico requests
+		#location = /favicon.ico { access_log off; log_not_found off; }
+		
+		
+		location /static/ {
+			root /label/iepy-develop/iepy/webui;
+		}
+		location /media/ {
+			root /home/wardseptember/django-blog;
+		}
+		# address of the content generated by gunicorn
+		location / {
+			proxy_pass  http://mysvr;  # forward requests to the server list defined in upstream mysvr
+		}
+	}
+
+
+
+
+}

+ 5 - 0
examples/start.md

@@ -0,0 +1,5 @@
+
+# Start the service
+gunicorn -w 5 --preload -b 0.0.0.0:8002 coreline.wsgi
+# Reload nginx
+/usr/local/nginx/sbin/nginx -s reload

Binary
examples/test/bin/分组_1.xls


Binary
examples/test/bin/分组_10.xls


Binary
examples/test/bin/分组_2.xls


Binary
examples/test/bin/分组_3.xls


Binary
examples/test/bin/分组_4.xls


Binary
examples/test/bin/分组_5.xls


Binary
examples/test/bin/分组_6.xls


Binary
examples/test/bin/分组_7.xls


Binary
examples/test/bin/分组_8.xls


Binary
examples/test/bin/分组_9.xls


+ 13 - 8
iepy/data/models.py

@@ -37,9 +37,9 @@ class EntityKind(BaseModel):
         return self.name
 
 class Payroll(BaseModel):
-    user = models.CharField(max_length=CHAR_MAX_LENGHT)
-    begin_time = models.CharField(max_length=10)
-    end_time = models.CharField(max_length=10)
+    user = models.CharField(max_length=CHAR_MAX_LENGHT,db_index=True)
+    begin_time = models.CharField(max_length=10,db_index=True)
+    end_time = models.CharField(max_length=10,db_index=True)
     doc_count = models.IntegerField()
     t_count = models.IntegerField()
     r_count = models.IntegerField()
@@ -101,18 +101,23 @@ class IEDocument(BaseModel):
                                     on_delete=models.PROTECT)
     human_identifier = models.CharField(
         max_length=CHAR_MAX_LENGHT,
-        unique=True
+        unique=True,
+        db_index=True
     )
 
     sourcetext = models.TextField(null=True)
-    edituser = models.TextField(null=True)
-    edittime = models.DateTimeField(null=True,blank=True)
-    reedittime = models.DateTimeField(null=True,blank=True)
-    brat_done_at = models.DateTimeField(null=True, blank=True)
+    edituser = models.TextField(null=True,db_index=True)
+    edittime = models.DateTimeField(null=True,blank=True,db_index=True)
+    reedittime = models.DateTimeField(null=True,blank=True,db_index=True)
+    brat_done_at = models.DateTimeField(null=True, blank=True,db_index=True)
 
     text = models.TextField()
     creation_date = models.DateTimeField(auto_now_add=True)
 
+    pre_label = models.TextField(blank=True)
+    deleted = models.IntegerField(default=0)
+    added = models.IntegerField(default=0)
+
     # The following 3 lists have 1 item per token
     tokens = ListField(blank=True)  # strings
     lemmas = ListField(blank=True)  # strings

Binary
iepy/selfpreprocess/BiddingKG/dl/complaint/models/punish_code.pb


+ 473 - 0
iepy/selfpreprocess/BiddingKG/dl/complaint/punish_predictor.py

@@ -0,0 +1,473 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/25 0025 16:35 
+
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import os
+import time
+import tensorflow as tf
+# from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+# from keras.preprocessing.sequence import pad_sequences
+# import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/punish_code.pb"):
+        print('model_file_path:',model_file)
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file # model used to predict punishment document numbers
+        self.load_model()
+
+    # load the punishment-code prediction model
+    def load_model(self):
+        log("get model of time")
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name("char_input:0")
+                    self.length = self.sess.graph.get_tensor_by_name("length:0")
+                    self.trans = self.sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                    self.logits = self.sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+    # punishment-code prediction
+    def predict_punishCode(self,list_sentences, MAX_AREA=5000):
+        '''
+        Predict the punishment code for each sentence.
+        :param list_sentences: list of sentence lists, one per article [[sentences of each article]]
+        :param MAX_AREA: cap on sentence length; longer sentences are truncated
+        :return: punishment-code string; multiple codes are separated by ";"
+        '''
+        re_ner = re.compile("12+?3")
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentences.sort(key=lambda x: len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    while True:
+                        MAX_LEN = len(sentences[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA // MAX_LEN
+                        sentence_len = [len(sentence.sentence_text) for sentence in sentences[_begin_index:_begin_index+_LEN]]
+                        sentences_x = []
+                        for sentence in sentences[_begin_index:_begin_index+_LEN]:
+                            sentence = sentence.sentence_text
+                            sentence = list(sentence)
+                            sentence2id = [getIndexOfWord(word) for word in sentence]
+                            sentences_x.append(sentence2id)
+                        sentences_x = pad_sequences(sentences_x, maxlen=MAX_LEN, padding="post", truncating="post")
+                        sentences_x = [np.array(x) for x in sentences_x]
+                        _logits, _trans = self.sess.run([self.logits, self.trans],
+                                                   feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                        viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                        ner_list = []
+                        for _seq, sentence in zip(viterbi_sequence, sentences[_begin_index:_begin_index+_LEN]):
+                            sentence = sentence.sentence_text
+                            seq_id = ''.join([str(s) for s in _seq])
+                            if re_ner.search(seq_id):
+                                # print("sentence: ",sentence)
+                                for _ner in re_ner.finditer(seq_id):
+                                    start = _ner.start()
+                                    end = _ner.end()
+                                    n = sentence[start:end]
+                                    # print(n,'<==>',start,end)
+                                    # ner_list.append((n, start, end))
+                                    ner_list.append(n)  # changed to return only the entity text
+                        # article_ner_list.append(ner_list)
+                        article_ner_list.append(';'.join(set(ner_list)))
+                        if _begin_index+_LEN >= len(sentences):
+                            break
+                        _begin_index += _LEN
+        return article_ner_list[0]
+
+    # punishment type
+    def get_punishType(self, x1, x2):
+        '''Determine the announcement category from its title and body.
+        x1: title
+        x2: body text
+        return: category'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''title regexes'''
+        # unknown announcement
+        unknow = re.compile('采购方式|采购公告|采购招标|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # complaint handling
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # administrative penalty
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # supervision and inspection
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # serious violation
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # misconduct
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # other misconduct
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''body-text regexes'''
+        # complaint handling
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # administrative penalty
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # integrity bonus points
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # serious violation / loss of credibility
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # misconduct
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # other misconduct
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # punishment decision
+    def get_punishDecision(self, x, x2):
+        '''Extract the handling decision from the body text via regex matching.
+        x: body text
+        x2: punishment category
+        return: decision string'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # whether the complaint was upheld
+    def get_punishWhether(self, x1, x2, x3):
+        '''Use regexes on the handling decision to judge whether the complaint was upheld.
+        x1: decision string
+        x2: body text
+        x3: punishment category
+        return: whether the complaint was upheld'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # enforcement agency and punishment time
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        Judge from the text preceding an entity whether it is the enforcement agency.
+        :param title: article title
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: enforcement-agency and punishment-time strings; multiple values are separated by ";"
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # use keywords before an entity to decide whether it is an enforcement agency or a punishment time
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # if no agency was found above, look for entities in the title and check for keywords via regex
+        if institutions == [] and len(title)>10:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # if still not found, check whether a company entity is followed by a date keyword; if so, take them as agency and punishment time
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # if nothing was found, take the last time entity as the punishment time when it sits at the end of the article
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # complainant, respondent and punished party
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        Find the complainant, respondent and punished party via regexes over the announcement category, sentence list and entity list.
+        :param punishType: punishment category of the announcement
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: complainant, respondent
+        '''
+        complainants = []  # complainants
+        punishPeople = []  # respondents / punished parties
+        size = 16
+        # complainant / challenger
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # punished party / respondent
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # list of punishment-related entity groups
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
+        list_result = []
+        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+            title = article.title
+            text=article.content
+            keyword, punishType = self.get_punishType(title, text)
+
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
+                list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
+        return list_result
+
+
+
+if __name__ == "__main__":
+    punish = Punish_Extract()
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts(text=s)
+    # print(punish_dic)

+ 98 - 70
iepy/selfpreprocess/BiddingKG/dl/complaint/punish_rule.py

@@ -75,6 +75,7 @@ def BiLSTM_CRF_tfmodel(sess,weights):
             grads_vars = opt.compute_gradients(crf_loss)
             capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
             train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            print('tensor: ',char_input, length, trans, _logits)
             return char_input,_logits,target,length,crf_loss,trans,train_op
 
 def decode(logits, trans, sequence_lengths, tag_num):
@@ -125,6 +126,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
+                    print('punish tensor: ',self.logits, self.trans, self.char_input, self.length)
                     _logits, _trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
@@ -143,7 +145,7 @@ class Punish_Extract():
                                 # ner_list.append((n, start, end))
                                 ner_list.append(n)  # 改为只返回实体字符
                     # article_ner_list.append(ner_list)
-                    article_ner_list.append(''.join(set(ner_list)))
+                    article_ner_list.append(';'.join(set(ner_list)))
         return article_ner_list[0]
 
     # 处罚类型
@@ -261,7 +263,7 @@ class Punish_Extract():
         elif re.search(rule4, x[-int(len(x)*0.4):]):
             return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
         else:
-            return ' '
+            return ''
 
     # 投诉是否成立
     def get_punishWhether(self, x1, x2, x3):
@@ -278,7 +280,7 @@ class Punish_Extract():
                         '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
                         '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
         if x3 != '投诉处理':
-            return ' '
+            return ''
         elif re.search(p1, x1):
             return '投诉成立'
         elif re.search(p2, x1):
@@ -287,7 +289,7 @@ class Punish_Extract():
             return '投诉成立'
         elif re.search(p2, x2):
             return '投诉无效'
-        return ' '
+        return ''
 
     # 执法机构、处罚时间
     def get_institution(self, title, sentences_l, entity_l):
@@ -296,7 +298,7 @@ class Punish_Extract():
         :param title: 文章标题
         :param sentences_l: 单篇公告句子列表
         :param entity_l: 单篇公告实体列表
-        :return: 执法机构及处罚时间字符串,多个的用号隔开
+        :return: 执法机构及处罚时间字符串,多个的用;号隔开
         '''
         institutions = []
         punishTimes = []
@@ -359,7 +361,7 @@ class Punish_Extract():
             institutions.append(ins)
         if punishTimes == [] and ptime != "":
             punishTimes.append(ptime)
-        return ";".join(institutions), ";".join(punishTimes)
+        return ";".join(institutions), ";".join(punishTimes)
 
     # 投诉人、被投诉人、被处罚人
     def get_complainant(self, punishType, sentences_l, entity_l):
@@ -426,7 +428,7 @@ class Punish_Extract():
                 punishPeople.append(ner_l)
         complainants = set([it.entity_text for l in complainants for it in l])
         punishPeople = set([it.entity_text for l in punishPeople for it in l])
-        return ';'.join(complainants), ';'.join(punishPeople)
+        return ';'.join(complainants), ';'.join(punishPeople)
 
     def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
         list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
@@ -459,73 +461,99 @@ class Punish_Extract():
         for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
             title = article.title
             text=article.content
+
             keyword, punishType = self.get_punishType(title, text)
-            if punishType == "未知类别":
-                list_result.append({"punish":{}})
-            else:
-                # print('处罚类型:',punishType)
-                punish_code = self.predict_punishCode(list_sentences)
-                # print('处罚编号: ',punish_code)
-                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
-                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-                punishDecision = self.get_punishDecision(text, punishType)
-                # print('处罚决定:',punishDecision)
-                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-                # print('投诉是否成立:',punishWhether)
-                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
-                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-                punish_dic = {'punish_code':punish_code,
-                              'punishType':punishType,
-                              'punishDecision':punishDecision,
-                             'complainants':complainants,
-                             'punishPeople':punishPeople,
-                             'punishWhether':punishWhether,
-                             'institutions':institutions,
-                             'punishTimes':punishTimes}
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
                 list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
         return list_result
 
-if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+def save_punish_code_model():
+    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
+    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
+    ckpt = tf.train.get_checkpoint_state(model_folder)
+    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        input_checkpoint = ckpt.model_checkpoint_path
+        saver = tf.train.import_meta_graph(input_checkpoint+".meta", clear_devices=True)
+        graph = tf.get_default_graph()
+        input_graph_def = graph.as_graph_def()
+        with tf.Session() as sess:
+            saver.restore(sess, input_checkpoint)
+            output_graph_def = graph_util.convert_variables_to_constants(
+                sess = sess,
+                input_graph_def = input_graph_def,
+                output_node_names=["char_input","length","crf_loss/transitons","CRF/output/logits"]
+            )
+            with tf.gfile.GFile(output_graph, "wb") as f:
+                f.write(output_graph_def.SerializeToString())
 
-    import pandas as pd
-    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
-    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
-    # i = 89
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    # i = 92
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
 
-    # t1 = time.time()
-    # for i in df.index:
-    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
-    #     df.loc[i, '投诉人'] = complainants
-    #     df.loc[i, '被投诉人'] = punishPeople
-    #     df.loc[i, '执法机构'] = institutions
-    #     df.loc[i, '处罚时间'] = punishTimes
-    #     df.loc[i, '处罚编号'] = punish_code
-    #     print('完成第%d篇'%i)
-    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
-    # #    'institution', 'punishTime', 'ner_test']])
-    # t2 = time.time()
+if __name__ == "__main__":
+    save_punish_code_model()
+    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    #
+    # import pandas as pd
+    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # # i = 89
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # # i = 92
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    #
+    # # t1 = time.time()
+    # # for i in df.index:
+    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    # #     df.loc[i, '投诉人'] = complainants
+    # #     df.loc[i, '被投诉人'] = punishPeople
+    # #     df.loc[i, '执法机构'] = institutions
+    # #     df.loc[i, '处罚时间'] = punishTimes
+    # #     df.loc[i, '处罚编号'] = punish_code
+    # #     print('完成第%d篇'%i)
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # # #    'institution', 'punishTime', 'ner_test']])
+    # # t2 = time.time()
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
     # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
-    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
-    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
-    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
-    # t3 = time.time()
-    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
-    # list_sentences = [s.split('。')]
-    # punish_code= punish.predict_punishCode( list_sentences)
-    # print(punish_code)
-
-    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
-    print(punish_dic)
+    # #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # # t3 = time.time()
+    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # # list_sentences = [s.split('。')]
+    # # punish_code= punish.predict_punishCode( list_sentences)
+    # # print(punish_code)
+    #
+    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts_backup(text=s)
+    # print(punish_dic)

+ 244 - 215
iepy/selfpreprocess/BiddingKG/dl/interface/Preprocessing.py

@@ -107,7 +107,7 @@ def tableToText(soup):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td)),0])
+                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -419,10 +419,10 @@ def tableToText(soup):
                 inner_table[_h][_w][1] = 0
 
 
-        print("=====")
-        for item in inner_table:
-            print(item)
-        print("======")
+        # print("=====")
+        # for item in inner_table:
+        #     print(item)
+        # print("======")
 
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
@@ -628,7 +628,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
+        entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
         height = len(inner_table)
         width = len(inner_table[0])
         text = ""
@@ -640,11 +640,7 @@ def tableToText(soup):
                 
             direct = getDirect(inner_table, head_begin, head_end)
 
-            print("----")
-            print(inner_table[head_begin:head_end])
-            print("head_end-head_begin",head_end-head_begin)
-            print(direct)
-            
+
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
                 text_line = ""
@@ -671,38 +667,34 @@ def tableToText(soup):
                         line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
                     table_occurence.append(line_oc)
 
+
+                occu_height = len(table_occurence)
+                occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
                 #为每个属性值寻找表头
-                for i in range(head_begin,head_end):
-                    pack_text = ""
-                    rank_text = ""
-                    entity_text = ""
-                    text_line = ""
-                    #在同一句话中重复的可以去掉
-                    text_set = set()
-                    for j in range(width):
+                for i in range(occu_height):
+                    for j in range(occu_width):
                         cell = table_occurence[i][j]
                         #是属性值
                         if cell["type"]==0 and cell["text"]!="":
                             left_head = ""
                             top_head = ""
 
-                            head = ""
-
                             find_flag = False
                             temp_head = ""
-                            for loop_i in range(0,i+1-head_begin):
+                            for loop_i in range(1,i+1):
                                 if not key_direct:
                                     key_values = [1,2]
                                 else:
                                     key_values = [1]
                                 if table_occurence[i-loop_i][j]["type"] in key_values:
                                     if find_flag:
-                                        if table_occurence[i-loop_i][j][0]!=temp_head:
+                                        if table_occurence[i-loop_i][j]["text"]!=temp_head:
                                             top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
                                     else:
                                         top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
                                     find_flag = True
                                     temp_head = table_occurence[i-loop_i][j]["text"]
+                                    table_occurence[i-loop_i][j]["occu_count"] += 1
                                 else:
                                     #找到表头后遇到属性值就返回
                                     if find_flag:
@@ -720,38 +712,21 @@ def tableToText(soup):
                                     key_values = [1,2]
                                 else:
                                     key_values = [2]
-                                if inner_table[i][j-loop_j][1] in key_values:
+                                if table_occurence[i][j-loop_j]["type"] in key_values:
                                     if find_flag:
-                                        if inner_table[i][j-loop_j][0]!=temp_head:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
+                                        if table_occurence[i][j-loop_j]["text"]!=temp_head:
+                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
                                     else:
-                                        head = inner_table[i][j-loop_j][0]+":"+head
+                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
                                     find_flag = True
-                                    temp_head = inner_table[i][j-loop_j][0]
+                                    temp_head = table_occurence[i][j-loop_j]["text"]
+                                    table_occurence[i][j-loop_j]["occu_count"] += 1
                                 else:
                                     if find_flag:
                                         break
-
-                            if str(head+inner_table[i][j][0]) in text_set:
-                                continue
-                            if re.search(packPattern,head) is not None:
-                                pack_text += head+inner_table[i][j][0]+","
-                            elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
-                                #排名替换为同一种表达
-                                rank_text += head+inner_table[i][j][0]+","
-                                #print(rank_text)
-                            elif re.search(entityPattern,head) is not None:
-                                entity_text += head+inner_table[i][j][0]+","
-                                #print(entity_text)
-                            else:
-                                text_line += head+inner_table[i][j][0]+","
-                            text_set.add(str(head+inner_table[i][j][0]))
-                    text += pack_text+rank_text+entity_text+text_line
-                    text = text[:-1]+"。" if len(text)>0 else text
-
-
+                            cell["left_head"] += left_head
                 if direct=="row":
-                    for i in range(head_begin,head_end):
+                    for i in range(occu_height):
                         pack_text = ""
                         rank_text = ""
                         entity_text = ""
@@ -759,131 +734,196 @@ def tableToText(soup):
                         #在同一句话中重复的可以去掉
                         text_set = set()
                         for j in range(width):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                head = ""
-                                
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        #找到表头后遇到属性值就返回
-                                        if find_flag:
-                                            break
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
 
-                                find_flag = False
-                                temp_head = ""
-
-
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                
-                                if str(head+inner_table[i][j][0]) in text_set:
+                                cell = table_occurence[i][j]
+                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
+                                head += cell["left_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
-                                    pack_text += head+inner_table[i][j][0]+","
+                                    pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    rank_text += head+inner_table[i][j][0]+","
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+
                         text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
                 else:
-                    for j in range(width):
-                    
+                    for j in range(occu_width):
+                        pack_text = ""
                         rank_text = ""
                         entity_text = ""
                         text_line = ""
                         text_set = set()
-                        for i in range(head_begin,head_end):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                find_flag = False
-                                head = ""
-                                temp_head = ""
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                if str(head+inner_table[i][j][0]) in text_set:
+                        for i in range(occu_height):
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
+                                head += cell["top_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
-                                if re.search(rankPattern,head) is not None:
-                                    rank_text += head+inner_table[i][j][0]+","
+                                if re.search(packPattern,head) is not None:
+                                    pack_text += head+cell["text"]+","
+                                elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                                    #排名替换为同一种表达
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
-                        text += rank_text+entity_text+text_line
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+                        text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
+
+                # if direct=="row":
+                #     for i in range(head_begin,head_end):
+                #         pack_text = ""
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         #在同一句话中重复的可以去掉
+                #         text_set = set()
+                #         for j in range(width):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 head = ""
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         #找到表头后遇到属性值就返回
+                #                         if find_flag:
+                #                             break
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #
+                #
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(packPattern,head) is not None:
+                #                     pack_text += head+inner_table[i][j][0]+","
+                #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                #                     #排名替换为同一种表达
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += pack_text+rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
+                # else:
+                #     for j in range(width):
+                #
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         text_set = set()
+                #         for i in range(head_begin,head_end):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 find_flag = False
+                #                 head = ""
+                #                 temp_head = ""
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(rankPattern,head) is not None:
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
         return text
     
     def removeFix(inner_table,fix_value="~~"):
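To make the header-prefixing logic above easier to follow, here is a minimal, self-contained sketch of the same idea (not the project code, and only the left-header lookup is shown): value cells are emitted as "表头:值", duplicates inside one row are skipped, and each row is closed with a Chinese full stop. The cell layout and the "text"/"type" field names are simplified assumptions.

def flatten_table(cells):
    # cells: 2-D list of dicts like {"text": "...", "type": 0 or 1}; 0 = value, 1 = header
    text = ""
    for row in cells:
        seen = set()
        line = ""
        for j, cell in enumerate(row):
            if cell["type"] != 0 or cell["text"] == "":
                continue
            # nearest header to the left in the same row
            left = next((c["text"] for c in reversed(row[:j]) if c["type"] == 1), "")
            item = ((left + ":") if left else "") + cell["text"]
            if item in seen:  # drop duplicates within the same row
                continue
            seen.add(item)
            line += item + ","
        if line:
            text += line[:-1] + "。"
    return text

print(flatten_table([[{"text": "项目名称", "type": 1}, {"text": "设备采购", "type": 0}]]))
# -> 项目名称:设备采购。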
@@ -948,12 +988,13 @@ def tableToText(soup):
     # return list_innerTable
 
 #数据清洗
-def segment(soup):
+def segment(soup,final=True):
     # print("==")
     # print(soup)
     # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
-    if soup.name=="td":
+    subspaceList = ["td",'a',"span","p"]
+    if soup.name in subspaceList:
         #判断有值叶子节点数
         _count = 0
         for child in soup.find_all(recursive=True):
@@ -966,27 +1007,26 @@ def segment(soup):
                 if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
                     text = soup.attrs['title']
 
-            _list = []
-            for x in re.split("\s+",text):
-                if x.strip()!="":
-                    _list.append(len(x))
-            if len(_list)>0:
-                _minLength = min(_list)
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-            else:
-                _substr = ""
-            text = _substr.join(re.split("(\s+)",text))
+            # _list = []
+            # for x in re.split("\s+",text):
+            #     if x.strip()!="":
+            #         _list.append(len(x))
+            # if len(_list)>0:
+            #     _minLength = min(_list)
+            #     if _minLength>2:
+            #         _substr = ","
+            #     else:
+            #         _substr = ""
+            # else:
+            #     _substr = ""
             text = text.replace("\r\n",",").replace("\n",",")
-            text = re.sub("^[,\s]*|[,\s]*$","",text)
+            text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
     commaList = ["div","br","td","p"]
     #commaList = []
     spaceList = ["span"]
-    subspaceList = ["td",'a',"span","p"]
+
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')
@@ -1000,8 +1040,8 @@ def segment(soup):
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
-        if child.name in spaceList:
-            child.insert_after(" ")
+        # if child.name in spaceList:
+        #     child.insert_after(" ")
     text = str(soup.get_text())
 
     #替换英文冒号为中文冒号
@@ -1012,67 +1052,56 @@ def segment(soup):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     
          
-    #删除标签中的所有空格
-    for subs in subspaceList:
-        patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
-        while(True):
-            oneMatch = re.search(re.compile(patten),text)
-            if oneMatch is not None:
-                _match = oneMatch.group(1)
-                _minLength = min([len(x) for x in re.split("(\s*)",_match)])
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s",_substr,oneMatch.group(1)))
-            else:
-                break
-    
-    
+
     #替换"""为"“",否则导入deepdive出错
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = re.sub("\s{4,}",",",text)   
     #替换标点
-    while(True):
-        #替换连续的标点
-        punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
-        if punc is not None:
-            text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
+
+    #替换连续的标点
+
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
+
+    list_punc = re.findall(punc_pattern,text)
+    list_punc.sort(key=lambda x:len(x),reverse=True)
+    for punc_del in list_punc:
+        if len(punc_del)>1:
+            text = re.sub(punc_del,punc_del[-1],text)
         
-        punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
-        if punc is not None:
-            text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
-        else:
-            #替换标点之后的空格
-            punc = re.search("(?P<punc>:|。|,|;)\s+",text)
-            if punc is not None:
-                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
-            else:
-                break
+
     #将连续的中文句号替换为一个
     text_split = text.split("。")
     text_split = [x for x in text_split if len(x)>0]
-    list_text = []
-    # for _t in text_split:
-    #     list_text.append(re.sub(")",")",re.sub("(","(",re.sub("\s*","",_t))))
     text = "。".join(text_split)
-    # text = text.replace(')',")").replace("(","(").replace("\s","")
-    #删除所有空格
+
+    # #删除标签中的所有空格
+    # for subs in subspaceList:
+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
+    #     while(True):
+    #         oneMatch = re.search(re.compile(patten),text)
+    #         if oneMatch is not None:
+    #             _match = oneMatch.group(1)
+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
+    #         else:
+    #             break
+
     # text过大报错
     LOOP_LEN = 10000
     LOOP_BEGIN = 0
     _text = ""
+
+
+
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s*","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
-    else:
-        return text
-    # text = re.sub("\s*","",text)
-    # #替换中文括号为英文括号
-    # text = re.sub("(","(",text)
-    # text = re.sub(")",")",text)
-    return _text
+        text = _text
+
+    if final:
+        text = re.sub("##space##"," ",text)
+
+    return text
 
 '''
 #数据清洗
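The reworked segment() above protects meaningful whitespace with a "##space##" placeholder, collapses runs of mixed punctuation down to their last character, and only restores the spaces on the final pass. A stand-alone sketch of the punctuation-collapsing step (the character class is copied from the diff; the helper name and the re.escape call are mine):

import re

def collapse_punct(text):
    # collect runs of punctuation/whitespace and keep only the last character of each run;
    # processing the longest runs first avoids re-matching their pieces on stale text
    runs = re.findall(r"[。,;::,\s]+", text)
    runs.sort(key=len, reverse=True)
    for run in runs:
        if len(run) > 1:
            text = re.sub(re.escape(run), run[-1], text)
    return text

print(collapse_punct("预算金额 :100万元, ,。"))  # -> 预算金额:100万元。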

+ 3 - 0
iepy/selfpreprocess/BiddingKG/dl/interface/getAttributes.py

@@ -1177,6 +1177,9 @@ def getOtherAttributes(list_entity):
             dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
+        elif entity.entity_type=='product':
+            dict_other["product"].append(entity.entity_text)
+    dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
 
 

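getOtherAttributes() now also collects product entity texts and deduplicates them with list(set(...)), which does not preserve order. If first-occurrence order ever matters, a dict-based variant keeps it (a side note, not what the commit does):

products = ["服务器", "交换机", "服务器"]
print(list(set(products)))            # deduplicated, arbitrary order (as in the diff)
print(list(dict.fromkeys(products)))  # deduplicated, first-occurrence order kept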
+ 94 - 8
iepy/selfpreprocess/BiddingKG/dl/interface/predictor.py

@@ -16,6 +16,8 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
+from tensorflow.python.framework import graph_util
+from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
 
 from threading import RLock
@@ -223,7 +225,7 @@ class CodeNamePredict():
             list_entitys = [[] for _ in range(len(list_sentences))]
         for list_sentence,list_entity in zip(list_sentences,list_entitys):
             if len(list_sentence)==0:
-                result.append([list_sentence[0].doc_id,{"code":[],"name":""}])
+                result.append([{"code":[],"name":""}])
                 continue
             doc_id = list_sentence[0].doc_id
             # sentences = []
@@ -522,7 +524,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
             p_sentences = 0
             while(p_entitys<len(list_entity)):
@@ -557,7 +561,9 @@ class PREMPredict():
         data_x = []
         points_entitys = []
         for list_entity,list_sentence in zip(list_entitys,list_sentences):
-            
+
+            list_entity.sort(key=lambda x:x.sentence_index)
+            list_sentence.sort(key=lambda x:x.sentence_index)
             p_entitys = 0
     
             while(p_entitys<len(list_entity)):
@@ -583,10 +589,12 @@ class PREMPredict():
     
     def predict_role(self,list_sentences, list_entitys):
         datas = self.search_role_data(list_sentences, list_entitys)
+
         if datas is None:
             return
         points_entitys = datas[1]
-        
+
+
         if USE_PAI_EAS:
             _data = datas[0]
             _data = np.transpose(np.array(_data),(1,0,2))
@@ -1134,7 +1142,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,30,60)
+        self.input_shape = (2,10,128)
         self.load_model()
 
     def load_model(self):
@@ -1168,10 +1176,13 @@ class TimePredictor():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
-                            left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
-                            right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
+                            left = s[0]
+                            right = s[1]
                             context = [left, right]
-                            x = embedding_word(context, shape=self.input_shape)
+                            x = embedding(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1198,6 +1209,80 @@ class TimePredictor():
                     values.append(item)
                     entity.set_Role(label, values)
 
+# Product field extraction
+class ProductPredictor():
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.load_model()
+
+    def load_model(self):
+        model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(model_path, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name='')
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
+                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
+                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
+                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
+                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+
+    def predict(self, list_sentences,list_entitys=None, MAX_AREA=5000):
+        '''
+        Predict product entities; each sentence contributes at most MAX_AREA characters and is truncated beyond that.
+        :param list_sentences: sentence lists of multiple announcements, [[sentences of announcement 1],[sentences of announcement 2]]
+        :param list_entitys: entity lists of multiple announcements
+        :param MAX_AREA: maximum number of characters taken from each sentence
+        :return: the predicted entities are placed into the entity lists
+        '''
+        with self.sess.as_default() as sess:
+            with self.sess.graph.as_default():
+                result = []
+                if list_entitys is None:
+                    list_entitys = [[] for _ in range(len(list_sentences))]
+                for list_sentence, list_entity in zip(list_sentences,list_entitys):
+                    if len(list_sentence)==0:
+                        result.append({"product":[]})
+                        continue
+                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    item = {"product":[]}
+                    temp_list = []
+                    while True:
+                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA//MAX_LEN
+                        chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]])
+                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                          feed_dict={
+                                                                    self.char_input: np.asarray(chars),
+                                                                    self.dropout: 1.0
+                                                                    })
+                        batch_paths = decode(scores, lengths, tran_)
+                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
+                            tags = ''.join([str(it) for it in path[:length]])
+                            for it in re.finditer("12*3", tags):
+                                start = it.start()
+                                end = it.end()
+                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                                sentence.doc_id, sentence.sentence_index, start, end),
+                                                 entity_text=sentence.sentence_text[start:end],
+                                                 entity_type="product", sentence_index=sentence.sentence_index,
+                                                 begin_index=0, end_index=0, wordOffset_begin=start,
+                                                 wordOffset_end=end)
+                                list_entity.append(_entity)
+                                temp_list.append(sentence.sentence_text[start:end])
+                        item["product"] = list(set(temp_list))
+                        result.append(item)
+                        if _begin_index+_LEN >= len(list_sentence):
+                            break
+                        _begin_index += _LEN
+                return result
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1559,6 +1644,7 @@ def save_timesplit_model():
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
 
+
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()

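ProductPredictor loads a frozen TensorFlow 1.x GraphDef (product.pb) and looks up its input/output tensors by name before running the CRF decode. A minimal sketch of that loading pattern, assuming the TF 1.x API used throughout the diff (the tensor names are the ones referenced above):

import tensorflow as tf  # TF 1.x style, as in the project

def load_frozen_graph(pb_path, tensor_names):
    # returns a session bound to the frozen graph plus the requested tensors
    sess = tf.Session(graph=tf.Graph())
    with sess.graph.as_default():
        graph_def = tf.GraphDef()
        with open(pb_path, "rb") as f:
            graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, name="")
        tensors = [sess.graph.get_tensor_by_name(n) for n in tensor_names]
    return sess, tensors

# usage sketch:
# sess, (char_input, logits, trans) = load_frozen_graph(
#     "product.pb", ["CharInputs:0", "logits/Reshape:0", "crf_loss/transitions:0"])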
Binary
iepy/selfpreprocess/BiddingKG/dl/interface/product_savedmodel/product.pb


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/saved_model.pb


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


Binary
iepy/selfpreprocess/BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 0 - 0
iepy/selfpreprocess/BiddingKG/dl/product/__init__.py


+ 155 - 0
iepy/selfpreprocess/BiddingKG/dl/product/data_util.py

@@ -0,0 +1,155 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:19
+import re
+import math
+import random
+import psycopg2
+import numpy as np
+from tensorflow.contrib.crf import viterbi_decode
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+
+id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+word_model = getModel_word()
+vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+word2id = {k: v for v, k in enumerate(vocab)}
+max_id = len(vocab)
+conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
+cursor = conn.cursor()
+
+def get_label_data():
+    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
+      and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
+    cursor.execute(sql)
+    writer = open('label_data.txt', 'w', encoding='utf-8')
+    datas = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        text = row[1]
+        # string = list(text)
+        tags = [0]*len(text)
+        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
+        cursor.execute(sql_lb)
+        for row_lb in cursor.fetchall():
+            label = row_lb[0]
+            _, _, begin, end, _ = re.split('\s',label)
+            begin = int(begin)
+            end = int(end)
+            if end-begin>=2:
+                tags[begin]=1
+                tags[end-1]=3
+                for i in range(begin+1,end-1):
+                    tags[i]=2
+        # datas.append([string, tags])
+        text_sentence = []
+        ids_sentence = []
+        tag_sentence = []
+        for i in range(len(text)):
+            text_sentence.append(text[i])
+            ids_sentence.append(word2id.get(text[i], max_id))
+            tag_sentence.append(tags[i])
+            writer.write("%s\t%s\n"%(text[i],tags[i]))
+            if text[i] in ['。','?','!',';']:
+                writer.write('\n')
+                if text_sentence:
+                    if len(text_sentence) > 100:
+                    # if len(text_sentence)>5 and len(text_sentence)<1000:
+                        datas.append([text_sentence, ids_sentence,tag_sentence])
+                    elif len(text_sentence) > 5:
+                        continue
+                    else:
+                        print('单句小于5或大于100,句子长度为:%d,文章ID:%s'%(len(text_sentence), docid))
+                    text_sentence = []
+                    ids_sentence = []
+                    tag_sentence = []
+        if text_sentence:
+            if len(text_sentence) > 5:
+            # if len(text_sentence) > 5 and len(text_sentence) < 1000:
+                datas.append([text_sentence, ids_sentence, tag_sentence])
+            else:
+                print('单句小于5或大于100,句子长度为:%d,文章ID:%s' % (len(text_sentence), docid))
+    writer.close()
+    return datas
+
+def input_from_line(line):
+    string = list(line)
+    ids = [word2id.get(k, max_id) for k in string]
+    tags = []
+    return [[string], [ids], [tags]]
+def process_data(sentences):
+    '''
+    Convert the strings to word-id sequences and pad them to a uniform length.
+    :param sentences: list of sentence strings from a document, e.g. ['招标公告','招标代理']
+    :return: padded id sequences of uniform length
+    '''
+    maxLen = max([len(sentence) for sentence in sentences])
+    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
+    return pad_tags
+
+def get_ner(BIE_tag):
+    ner = set()
+    for it in re.finditer('BI*E',BIE_tag):
+        ner.add((it.start(),it.end()))
+    return ner
+
+def decode(logits, lengths, matrix):
+    paths = []
+    small = -1000.0
+    start = np.asarray([[small]*4+[0]])
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        pad = small * np.ones([length, 1])
+        logits = np.concatenate([score, pad], axis=1)
+        logits = np.concatenate([start, logits], axis=0)
+        path, _  = viterbi_decode(logits, matrix)
+        paths.append(path[1:])
+    return paths
+
+def result_to_json(line, tags):
+    result = []
+    ner = []
+    tags = ''.join([str(it) for it in tags])
+    for it in re.finditer("12*3", tags):
+        start = it.start()
+        end = it.end()
+        ner.append([line[start:end], (start, end)])
+    result.append([line, ner])
+    print(tags)
+    return result
+
+
+class BatchManager(object):
+    def __init__(self, data, batch_size):
+        self.batch_data = self.sort_and_pad(data, batch_size)
+        self.len_data = len(self.batch_data)
+
+    def sort_and_pad(self, data, batch_size):
+        num_batch = int(math.ceil(len(data)/batch_size))
+        sorted_data = sorted(data, key=lambda x:len(x[0]))
+        print('最小句子长度:%d;最大句子长度:%d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # 临时增加打印句子长度
+        batch_data = list()
+        for i in range(num_batch):
+            batch_data.append(self.pad_data(sorted_data[i*int(batch_size):(i+1)*int(batch_size)]))
+        return batch_data
+
+    @staticmethod
+    def pad_data(data):
+        strings = []
+        chars = []
+        targets = []
+        max_length = max([len(sentence[0]) for sentence in data])
+        for line in data:
+            string, char, target = line
+            padding = [0]*(max_length-len(string))
+            strings.append(string + padding)
+            chars.append(char + padding)
+            targets.append(target + padding)
+        return [strings, chars, targets]
+
+    def iter_batch(self, shuffle=False):
+        if shuffle:
+            random.shuffle(self.batch_data)
+        for idx in range(self.len_data):
+            yield self.batch_data[idx]

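decode() above pads each score matrix with a large negative value, prepends an artificial start step, and runs Viterbi decoding against the learned transition matrix. A tiny self-contained run of the same call on dummy scores (tensorflow.contrib.crf is a TF 1.x dependency, as in the diff):

import numpy as np
from tensorflow.contrib.crf import viterbi_decode

num_tags, length, small = 4, 6, -1000.0
scores = np.random.rand(length, num_tags)          # per-character tag scores
trans = np.zeros((num_tags + 1, num_tags + 1))     # transition matrix incl. padding tag

pad = small * np.ones([length, 1])
logits = np.concatenate([scores, pad], axis=1)     # add the padding tag column
start = np.asarray([[small] * num_tags + [0]])
logits = np.concatenate([start, logits], axis=0)   # prepend the artificial start step

path, _ = viterbi_decode(logits, trans)
print(path[1:])                                    # drop the start step, one tag per character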
+ 68 - 4
iepy/selfpreprocess/self_preprocess.py

@@ -18,31 +18,53 @@ import iepy.selfpreprocess.BiddingKG.dl.interface.getAttributes as getAttributes
 import iepy.selfpreprocess.BiddingKG.dl.entityLink.entityLink as entityLink
 import json
 from iepy.selfpreprocess.pipeline import PreProcessSteps
+import iepy.selfpreprocess.BiddingKG.dl.complaint.punish_predictor as punish_rule
 
 from iepy.webui.brat.src import annotator
 
 
 logger = logging.getLogger(__name__)
 
+
 codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
 timePredict = predictor.TimePredictor()
+punish = punish_rule.Punish_Extract()
+productPredict = predictor.ProductPredictor()
 
 def predict(doc_id,text):
 
     log("process %s"%doc_id)
+
     list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
 
     codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    print(codeName)
     premPredict.predict(list_sentences,list_entitys)
+    productPredict.predict(list_sentences,list_entitys)
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
+    print("entityLink")
     timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
     entityLink.link_entitys(list_entitys)
-    _prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-    log("extract done %s"%(str(_prem)))
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+
+
+    # codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    # productPredict.predict(list_sentences,list_entitys)
+    # premPredict.predict(list_sentences,list_entitys)
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    # epcPredict.predict(list_sentences,list_entitys)
+    # timePredict.predict(list_sentences, list_entitys)
+    # entityLink.link_entitys(list_entitys)
+    # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    #
+    log("extract done %s"%(str(prem)))
     return list_articles,list_sentences,list_entitys
 
 
@@ -135,6 +157,7 @@ class SelfAnalizer():
         dict_sentences = dict()
         offset_word = 0
         offset_words = 0
+        self.sentences.sort(key=lambda x:x.sentence_index)
         for sentence in self.sentences:
             # print(len(sentence.sentence_text),sentence.sentence_text)
             if sentence.sentence_index not in dict_sentences:
@@ -231,39 +254,78 @@ class SelfAnalizer():
 
     def generate_spans_relations(self):
         print("%s entity length:%d"%(self.docid,len(self.entitys)))
+        list_pre_label = []
         for _entity in self.entitys:
             doc_id = _entity.doc_id
             offset = [[self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin,self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end]]
             _type = getType(_entity)
             ann_id = annotator.create_span_interface(document=doc_id,offsets=offset,_type=_type)
             _entity.ann_id = ann_id
+            _label = "T|%s|%d|%d"%(_type,offset[0][0],offset[0][1])
+            list_pre_label.append(_label)
         for _entity in self.entitys:
             if _entity.pointer_pack is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_pack.ann_id
                 _type = dict_relations["pointer_pack"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_pack
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_money is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_money.ann_id
                 # print("$$$$$$$$",_entity.pointer_money.entity_text)
                 _type = dict_relations["pointer_money"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_money
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_person is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_person.ann_id
                 _type = dict_relations["pointer_person"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_person
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_address is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_address.ann_id
                 _type = dict_relations["pointer_address"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_address
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
             if _entity.pointer_tendereeMoney is not None:
                 origin = _entity.ann_id
                 target = _entity.pointer_tendereeMoney.ann_id
                 _type = dict_relations["pointer_tendereeMoney"]
                 annotator.create_arc_interface(document=_entity.doc_id,origin=origin,target=target,type=_type)
+                origin_begin = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_begin
+                origin_end = self.dict_sentences[_entity.sentence_index]["offset_word"][0]+_entity.wordOffset_end
+                p_target = _entity.pointer_tendereeMoney
+                target_begin = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_begin
+                target_end = self.dict_sentences[p_target.sentence_index]["offset_word"][0]+p_target.wordOffset_end
+                _label = "R|%s|%d|%d|%d|%d"%(_type,origin_begin,origin_end,target_begin,target_end)
+                list_pre_label.append(_label)
+        return list_pre_label
 
 
 
@@ -282,7 +344,7 @@ class SelfPreprocesser(BasePreProcessStepRunner):
     def run_everything(self,document):
         analysis = SelfAnalizer(document.human_identifier,document.sourcetext)
         # Tokenization
-        if len(analysis.entitys)>5 and len(analysis.entitys)<500:
+        if len(analysis.entitys)>5 and len(analysis.entitys)<300:
             document.text = analysis.article.content
             tokens = analysis.get_tokens()
             offsets = analysis.get_token_offsets()
@@ -310,7 +372,8 @@ class SelfPreprocesser(BasePreProcessStepRunner):
 
             # Save progress so far, next step doesn't modify `document`
             document.save()
-            analysis.generate_spans_relations()
+            list_pre_label = analysis.generate_spans_relations()
+            document.pre_label = ';'.join(list_pre_label)
             document.brat_done_at = datetime.datetime.now()
             document.save()
         else:
@@ -324,3 +387,4 @@ if __name__=="__main__":
 
 
 
+

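generate_spans_relations() now also returns the pre-annotation as compact strings, "T|type|begin|end" for spans and "R|type|origin_begin|origin_end|target_begin|target_end" for relations, joined with ';' into document.pre_label so that later manual edits can be compared against the model output. A small helper for turning such a string back into a set (the helper and the example values are illustrative, not taken from the project):

def pre_label_to_set(pre_label):
    # split "T|...;R|...;" back into individual label strings, ignoring empty pieces
    return {part for part in (pre_label or "").split(";") if part != ""}

labels = pre_label_to_set("T|org|0|4;R|pointer_money|0|4|10|15;")
print(len(labels))  # 2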
File diff is too large to display
+ 159 - 0
iepy/selfpreprocess/test4.py


+ 1 - 1
iepy/webui/brat/models.py

@@ -13,7 +13,7 @@ class BaseModel(models.Model):
 
 class BratAnnotation(BaseModel):
 
-    document_id = models.CharField(max_length=CHAR_LENGTH)
+    document_id = models.CharField(max_length=CHAR_LENGTH,db_index=True)
 
     value = models.CharField(max_length=CHAR_LENGTH*3)
 

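The new db_index=True on document_id backs the frequent per-document lookups (for example the filter/delete calls in save_annotations_db), and it only takes effect after running makemigrations and migrate for the app. A usage sketch of the kind of query that benefits (the import path is an assumption):

from iepy.webui.brat.models import BratAnnotation  # import path assumed

# fetch all stored annotation lines for one document; served by the new index
values = (BratAnnotation.objects
          .filter(document_id="74800260")
          .values_list("value", flat=True))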
+ 83 - 14
iepy/webui/brat/src/annotation.py

@@ -245,18 +245,61 @@ def get_annotations_db(filename):
         list_ann.append(ann.value)
     return "\n".join(list_ann)
 
+import re
+from datetime import datetime
+T_pattern = re.compile('(?P<T>T\d+)\t(?P<type>[^\s]+) (?P<begin>\d+) (?P<end>\d+)')
+R_pattern = re.compile("(?P<R>R\d+)\t(?P<type>[^\s]+)[^:]*:(?P<arg1>T\d+)[^:]*:(?P<arg2>T\d+)")
+
+import traceback
+
 def save_annotations_db(filename,str_ann):
     #save the annotations to db instead of textfile
     try:
+        pre_label = IEDocument.objects.filter(human_identifier=filename).values("pre_label")
+        _label = ""
+        if len(pre_label)>0:
+            if pre_label[0]["pre_label"] is not None:
+                _label = pre_label[0]["pre_label"]
+        dict_T = dict()
+        dict_R = dict()
+        set_pre_label = set()
+        for _i in _label.split(";"):
+            if _i!="":
+                set_pre_label.add(_i)
+        set_label = set()
         with transaction.atomic():
             brat_annotations.objects.filter(document_id=filename).delete()
+            list_anno = []
             for _str in str_ann.split("\n"):
-                # print("======",_str,"=====")
                 _str = _str.strip()
                 if _str != "":
                     ann = brat_annotations(document_id=filename,value=_str)
-                    ann.save()
+                    list_anno.append(ann)
+
+                    if _str[0]=="T":
+                        match = re.search(T_pattern,_str)
+                        if match is not None:
+                            match = match.groupdict()
+                            dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+                    if _str[0]=="R":
+                        match = re.search(R_pattern,_str)
+                        if match is not None:
+                            match = match.groupdict()
+                            dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+            if len(list_anno)>0:
+                brat_annotations.objects.bulk_create(list_anno)
+            for _T,_v in dict_T.items():
+                set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+            for _R,_v in dict_R.items():
+                set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+            union_set = set_pre_label&set_label
+            deleted = len(set_pre_label)-len(union_set)
+            added = len(set_label)-len(union_set)
+            IEDocument.objects.filter(human_identifier=filename).update(reedittime=datetime.now(),deleted=deleted,added=added)
+
+
     except Exception as e:
+        traceback.print_exc()
         logger.warn("document %s save error"%filename)
 
 def __split_annotation_id(id):
@@ -1257,7 +1300,8 @@ class TextAnnotations(Annotations):
             #     return f.read()
             #change read ways to db
             _ieDoc = IEDocument.objects.filter(human_identifier=document)
-            return _ieDoc.get().text
+            logger.warn(document)
+            return _ieDoc.first().text
         except IOError:
             Messager.error('Error reading document text from %s' % textfn)
         raise AnnotationTextFileNotFoundError(document)
@@ -1749,14 +1793,39 @@ class BinaryRelationAnnotation(IdedAnnotation):
 
 
 if __name__ == '__main__':
-    from sys import stderr, argv
-    for ann_path_i, ann_path in enumerate(argv[1:]):
-        print >> stderr, ("%s.) '%s' " % (ann_path_i, ann_path, )
-                          ).ljust(80, '#')
-        try:
-            with Annotations(ann_path) as anns:
-                for ann in anns:
-                    print >> stderr, str(ann).rstrip('\n')
-        except ImportError:
-            # Will try to load the config, probably not available
-            pass
+    filename = '74800260'
+    pre_label = IEDocument.objects.filter(human_identifier='74800260').values("pre_label")
+    _label = ""
+    if len(pre_label)>0:
+        if pre_label[0]["pre_label"] is not None:
+            _label = pre_label[0]["pre_label"]
+    dict_T = dict()
+    dict_R = dict()
+    set_pre_label = set()
+    for _i in _label.split(";"):
+        if _i!="":
+            set_pre_label.add(_i)
+    set_label = set()
+    anns = brat_annotations.objects.filter(document_id='74800260').values("value")
+    for _str in anns:
+        _str = _str["value"].strip()
+        if _str != "":
+            if _str[0]=="T":
+                match = re.search(T_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_T[match["T"]] = {"type":match["type"],"begin":match["begin"],"end":match["end"]}
+            if _str[0]=="R":
+                match = re.search(R_pattern,_str)
+                if match is not None:
+                    match = match.groupdict()
+                    dict_R[match["R"]] = {"type":match["type"],"arg1":match["arg1"],"arg2":match["arg2"]}
+    for _T,_v in dict_T.items():
+        set_label.add("T|%s|%d|%d"%(_v["type"],int(_v["begin"]),int(_v["end"])))
+    for _R,_v in dict_R.items():
+        set_label.add("R|%s|%d|%d|%d|%d"%(_v["type"],int(dict_T[_v["arg1"]]["begin"]),int(dict_T[_v["arg1"]]["end"]),int(dict_T[_v["arg2"]]["begin"]),int(dict_T[_v["arg2"]]["end"])))
+    union_set = set_pre_label&set_label
+    deleted = len(set_pre_label)-len(union_set)
+    added = len(set_label)-len(union_set)
+    print(deleted,added)
+    # IEDocument.objects.filter(human_identifier=filename).update(reedittime=datetime.now(),deleted=deleted,added=added)

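save_annotations_db() now parses the saved lines with T_pattern/R_pattern, rebuilds the same "T|..."/"R|..." strings, and compares them with the stored pre_label to record how many pre-annotations were removed and how many were newly added. The counting is two set differences; a stripped-down run with toy data:

pre_label = {"T|org|0|4", "T|money|10|15", "R|pointer_money|0|4|10|15"}
current = {"T|org|0|4", "T|person|20|23"}

kept = pre_label & current              # annotations present both before and after editing
deleted = len(pre_label) - len(kept)    # pre-annotations the user removed or changed
added = len(current) - len(kept)        # annotations the user introduced
print(deleted, added)                   # 2 1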
+ 0 - 3
iepy/webui/brat/src/config.py

@@ -34,9 +34,6 @@ BASE_DIR = os.path.dirname(__file__)
 # WORK_DIR = path_join(BASE_DIR, '/work')
 DATA_DIR = BASE_DIR+"/data"
 WORK_DIR = BASE_DIR+"/work"
-print("%%")
-print(BASE_DIR)
-print(WORK_DIR)
 # If you have installed brat as suggested in the installation
 # instructions, you can set up BASE_DIR, DATA_DIR and WORK_DIR by
 # removing the three lines above and deleting the initial '#'

+ 3 - 1
iepy/webui/brat/src/dispatch.py

@@ -24,7 +24,7 @@ from convert.convert import convert
 from delete import delete_collection, delete_document
 from docimport import save_import
 from document import (get_configuration, get_directory_information,
-                      get_document, get_document_timestamp,moveDocument,searchLabel)
+                      get_document, get_document_timestamp,moveDocument,searchLabel,getChangedOfDocument,getSelfLabel)
 from download import download_collection, download_file
 from jsonwrap import dumps
 from message import Messager
@@ -50,8 +50,10 @@ def logging_no_op(collection, document, log):
 DISPATCHER = {
     'getCollectionInformation': get_directory_information,
     'getDocument': get_document,
+    'getChangedOfDocument':getChangedOfDocument,
     'moveDocument':moveDocument,
     'searchLabel':searchLabel,
+    'getSelfLabel':getSelfLabel,
     'getDocumentTimestamp': get_document_timestamp,
     'importDocument': save_import,
 

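dispatch.py registers the two new actions in the same name-to-handler table used by the rest of the brat AJAX API, so the client only has to send the action name. The pattern is a plain dict lookup; a minimal sketch with a stub handler:

def get_changed_of_document(document):
    # stub standing in for document.getChangedOfDocument
    return {"deleted": 0, "added": 0}

DISPATCHER = {"getChangedOfDocument": get_changed_of_document}
print(DISPATCHER["getChangedOfDocument"]("74800260"))  # {'deleted': 0, 'added': 0}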
+ 96 - 22
iepy/webui/brat/src/document.py

@@ -45,7 +45,53 @@ from stats import get_statistics
 from iepy.webui.corpus.models import IEDocument
 from django.db.models import Q
 from threading import RLock
-from django.db import connection
+from django.db import connection,transaction
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+def getSelfLabel(time_begin,time_end,page_num,request):
+    _limit = 20
+    if page_num=="":
+        page_num = 1
+    elif int(page_num)<=0:
+        page_num = 1
+    _offset = _limit*(int(page_num)-1)
+
+    if time_begin!="":
+        and_begin = " to_char(edittime,'yyyy-mm-dd')>='%s'" %(time_begin)
+    else:
+        and_begin = " 1=1 "
+    if time_end!="":
+        and_end = " to_char(edittime,'yyyy-mm-dd')<='%s' "%(time_end)
+    else:
+        and_end = " 1=1 "
+    user = request.user.username
+    cursor = connection.cursor()
+    where_sql = " where edituser='%s' and %s and %s "%(user,and_begin,and_end)
+    sql = "select count(1) from corpus_iedocument %s"%where_sql
+    logger.warn(sql)
+    cursor.execute(sql)
+    all_num = cursor.fetchall()[0][0]
+    num_page = all_num//_limit
+    if all_num%_limit!=0:
+        num_page += 1
+    list_result = []
+    where_sql = " where edituser='%s' and %s and %s offset %d limit %d "%(user,and_begin,and_end,_offset,_limit)
+    sql = " select row_number() over(order by edittime desc) as id,human_identifier as document_id,to_char(edittime,'yyyy-mm-dd') as edittime,deleted,added from corpus_iedocument %s"%where_sql
+    logger.warning(sql)
+    cursor.execute(sql)
+    vol = cursor.description
+    rows = cursor.fetchall()
+    for row in rows:
+        _dict = dict()
+        for _n,_v in zip(vol,row):
+            _name = _n[0]
+            _dict[_name] = _v
+        list_result.append(_dict)
+
+    return {"rowsData":list_result,"all_num":all_num,"num_page":num_page,"page_num":page_num}
 
 def _fill_type_configuration(
         nodes,
@@ -936,64 +982,84 @@ def get_document(collection, document):
     # doc_path = path_join(real_dir, document)
     return _document_json_dict(document)
 
+def getChangedOfDocument(document):
+    docs = IEDocument.objects.filter(Q(human_identifier=document)).values('deleted','added')
+    _dict = {"deleted":0,"added":0}
+    if len(docs)>0:
+        _dict = docs[0]
+    return _dict
+
 doc_lock = RLock()
 
 def moveDocument(dir,document,request):
     user = request.user.username
     if int(dir)>0:
         #get the next of the user edit before
-        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) and Q(edituser=user)).order_by("-edittime")
+        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser=user)).order_by("-edittime").values('id','human_identifier')
         list_docs = list(docs)
         _index = None
         for i in range(len(list_docs)):
             _doc = list_docs[i]
-            if _doc.human_identifier==document:
+            if _doc['human_identifier']==document:
                 _index = i
                 break
         if _index is None or _index==0:
             #next
             #use lock to gain the next document
             with doc_lock:
-                docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser__isnull=True) & Q(jump_signal=0)).order_by("id")
+                docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser__isnull=True) & Q(jump_signal=0)).order_by("id").values('id','human_identifier')
                 if docs.count()>0:
-                    _doc = docs.first()
-                    new_document = _doc.human_identifier
-                    _doc.edittime = datetime.now()
-                    _doc.edituser = user
-                    _doc.save()
+                    update_flag = False
+                    for _doc in docs:
+                        new_document = _doc['human_identifier']
+
+                        with transaction.atomic():
+                            # conditional UPDATE acts as a compare-and-set:
+                            # it only succeeds if no other user has claimed
+                            # this document in the meantime
+                            updated = IEDocument.objects.filter(
+                                id=_doc["id"],
+                                edituser__isnull=True
+                            ).update(
+                                edituser=user,
+                                edittime=datetime.now()
+                            )
+                        if updated==0:
+                            # nothing was written, try the next candidate
+                            continue
+                        update_flag = True
+                        break
+                    if not update_flag:
+                        new_document = document
+                        Messager.info("数据标注完成")
                 else:
                     new_document = document
                     Messager.info("数据标注完成")
         else:
             _doc = list_docs[_index-1]
-            _doc.reedittime = datetime.now()
-            new_document = _doc.human_identifier
+            new_document = _doc['human_identifier']
 
     else:
         #prev
         #get the prev document of the current user
-        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) and Q(edituser=user) ).order_by("-edittime")
+        docs = IEDocument.objects.filter(Q(brat_done_at__isnull=False) & Q(edituser=user)).order_by("-edittime").values('id','human_identifier')
         if docs.count()>1:
             list_docs = list(docs)
             _index = None
             for i in range(len(list_docs)):
                 _doc = list_docs[i]
-                if _doc.human_identifier==document:
+                if _doc['human_identifier']==document:
                     _index = i
-                    _doc.reedittime = datetime.now()
-                    _doc.save()
                     break
             if _index is not None:
                 if  _index<len(list_docs)-1:
-                    new_document = list_docs[_index+1].human_identifier
+                    new_document = list_docs[_index+1]["human_identifier"]
                 else:
-                    new_document = list_docs[-1].human_identifier
+                    new_document = list_docs[-1]["human_identifier"]
                     Messager.info("已经是第一条数据")
             else:
                 if document=="":
-                    new_document = list_docs[0].human_identifier
+                    new_document = list_docs[0]["human_identifier"]
                 else:
-                    new_document = list_docs[1].human_identifier
+                    new_document = list_docs[1]["human_identifier"]
 
         else:
             new_document = ""
@@ -1002,13 +1068,21 @@ def moveDocument(dir,document,request):
 def searchLabel(request):
     user = request.user.username
     cursor = connection.cursor()
-    sql = " select human_identifier from corpus_iedocument where edituser='%s'order by edittime asc" %(user)
+    sql = 'select max(end_time) from corpus_payroll where "user"=\'%s\''%(user)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    end_time = rows[0][0]
+    and_sql = " and 1=1 "
+    if end_time is not None:
+        and_sql = " and to_char(edittime,'yyyy-mm-dd')>'%s' "%end_time
+
+    sql = " select human_identifier from corpus_iedocument where edituser='%s' %s order by edittime asc" %(user,and_sql)
     cursor.execute(sql)
     list_docid = []
     for row in cursor.fetchall():
         list_docid.append(row[0])
     set_first = set(list_docid[:1200])
-    sql = " select document_id,value from brat_bratannotation where document_id in(select human_identifier from corpus_iedocument where edituser='%s') "%(user)
+    sql = " select document_id,value from brat_bratannotation where document_id in(select human_identifier from corpus_iedocument where edituser='%s' %s) "%(user,and_sql)
     cursor.execute(sql)
     eleCount = 0
     relCount = 0
@@ -1033,7 +1107,7 @@ def searchLabel(request):
     return {"docCount":len(list_docid),"eleCount":eleCount,"relCount":relCount,"wage":round(wage*0.9,2)}
 
 
-def get_document_timestamp(collection, document):
+def get_document_timestamp(collection, document,request):
     directory = collection
     real_dir = real_directory(directory)
     assert_allowed_to_read(real_dir)

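Note: moveDocument now claims the next unassigned document with a conditional UPDATE instead of read-then-save, and getSelfLabel pages a user's own annotations with hand-built SQL. A minimal sketch of both ideas; the helper names try_claim/count_edited and the exact query text are illustrative, not part of the commit:

    from datetime import datetime
    from django.db import connection
    from iepy.webui.corpus.models import IEDocument

    def try_claim(doc_id, user):
        # The "edituser IS NULL" condition turns the UPDATE into a
        # compare-and-set: update() returns the number of rows it changed,
        # so only one concurrent annotator gets 1 back for a given document.
        return IEDocument.objects.filter(
            id=doc_id, edituser__isnull=True
        ).update(edituser=user, edittime=datetime.now()) == 1

    def count_edited(user, time_begin):
        # Parameterised variant of the interpolated SQL used in getSelfLabel;
        # letting the driver bind the values avoids SQL injection.
        with connection.cursor() as cursor:
            cursor.execute(
                "select count(1) from corpus_iedocument"
                " where edituser = %s and to_char(edittime, 'yyyy-mm-dd') >= %s",
                [user, time_begin],
            )
            return cursor.fetchone()[0]
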
+ 13 - 12
iepy/webui/brat/src/search.py

@@ -17,7 +17,8 @@ DEFAULT_EMPTY_STRING = "***"
 REPORT_SEARCH_TIMINGS = False
 DEFAULT_RE_FLAGS = re.UNICODE
 ###
-
+import logging
+logger = logging.getLogger(__name__)
 if REPORT_SEARCH_TIMINGS:
     from sys import stderr
     from datetime import datetime
@@ -172,20 +173,20 @@ def __filenames_to_annotations(filenames):
                 "").replace(
                 ".rel",
                 "")
-            ann_obj = annotation.TextAnnotations(nosuff_fn, read_only=True)
-            anns.append(ann_obj)
+            with annotation.TextAnnotations(nosuff_fn, read_only=True) as ann_obj:
+                anns.append(ann_obj)
         except annotation.AnnotationFileNotFoundError:
-            print("%s:\tFailed: file not found" % fn, file=sys.stderr)
+            logger.info("%s:\tFailed: file not found" % fn, file=sys.stderr)
         except annotation.AnnotationNotFoundError as e:
-            print("%s:\tFailed: %s" % (fn, e), file=sys.stderr)
+            logger.info("%s:\tFailed: %s" % (fn, e), file=sys.stderr)
 
     if len(anns) != len(filenames):
-        print("Note: only checking %d/%d given files" % (
+        logger.info("Note: only checking %d/%d given files" % (
             len(anns), len(filenames)), file=sys.stderr)
 
     if REPORT_SEARCH_TIMINGS:
         process_delta = datetime.now() - process_start
-        print("filenames_to_annotations: processed in", str(
+        logger.info("filenames_to_annotations: processed in", str(
             process_delta.seconds) + "." + str(process_delta.microseconds / 10000), "seconds", file=stderr)
 
     return anns
@@ -210,13 +211,13 @@ def __document_to_annotations(directory, document):
     """Given a directory and a document, returns an Annotations object for the
     file."""
     # TODO: put this shared functionality in a more reasonable place
-    from document import real_directory
-    from os.path import join as path_join
+    # from document import real_directory
+    # from os.path import join as path_join
 
-    real_dir = real_directory(directory)
-    filenames = [path_join(real_dir, document)]
+    # real_dir = real_directory(directory)
+    # filenames = [path_join(real_dir, document)]
 
-    return __filenames_to_annotations(filenames)
+    return __filenames_to_annotations([document])
 
 
 def __doc_or_dir_to_annotations(directory, document, scope):

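Note: the print(..., file=stderr) diagnostics above were switched to a module-level logger, so they only show up once logging is configured somewhere in the project. A standalone sketch of a setup that keeps the old stderr behaviour (the format string and logger name are assumptions, not part of the commit):

    import logging

    # basicConfig attaches a StreamHandler that writes to stderr by default
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )
    logger = logging.getLogger("brat.search")
    logger.info("Note: only checking %d/%d given files", 3, 5)
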
+ 9 - 0
iepy/webui/brat/static/client/src/annotator_ui.js

@@ -1362,6 +1362,8 @@ var AnnotatorUI = (function($, window, undefined) {
 
         // TODO: sorting on click on header (see showFileBrowser())
       }
+
+
       var performNormSearch = function() {
         var val = $('#norm_search_query').val();
         var db = $('#span_norm_db').val();
@@ -2451,6 +2453,7 @@ var AnnotatorUI = (function($, window, undefined) {
         dispatcher.post('ajax', [spanOptions, 'edited']);
         dispatcher.post('hideForm');
         $('#waiter').dialog('open');
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var reselectSpan = function() {
@@ -2459,6 +2462,7 @@ var AnnotatorUI = (function($, window, undefined) {
         $(editedSpan.rect).addClass('reselect');
         reselectedSpan = editedSpan;
         selectedFragment = null;
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var splitForm = $('#split_form');
@@ -2504,6 +2508,7 @@ var AnnotatorUI = (function($, window, undefined) {
         var $roleButtons = $roles.find('input').button();
         
         dispatcher.post('showForm', [splitForm]);
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var addFragment = function() {
@@ -2512,6 +2517,7 @@ var AnnotatorUI = (function($, window, undefined) {
         $(editedSpan.rect).addClass('reselect');
         reselectedSpan = editedSpan;
         selectedFragment = false;
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var reselectFragment = function() {
@@ -2540,6 +2546,7 @@ var AnnotatorUI = (function($, window, undefined) {
         dispatcher.post('ajax', [spanOptions, 'edited']);
         dispatcher.post('hideForm');
         $('#waiter').dialog('open');
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var spanChangeLock = function(evt) {
@@ -2599,6 +2606,7 @@ var AnnotatorUI = (function($, window, undefined) {
             });
             $lock.click(spanChangeLock);
             $($span).buttonset();
+            dispatcher.post('getChangedOfDocument',[doc]);
           },
           beforeClose: function(evt) {
             // in case the form is cancelled
@@ -2671,6 +2679,7 @@ var AnnotatorUI = (function($, window, undefined) {
 
         $('#waiter').dialog('open');
         dispatcher.post('ajax', [spanOptions, 'edited']);
+        dispatcher.post('getChangedOfDocument',[doc]);
         return false;
       };
       $('#span_notes').focus(function () {

+ 51 - 0
iepy/webui/brat/static/client/src/custom.js

@@ -0,0 +1,51 @@
+var customer = (function($, window, undefined) {
+    var customer = function(dispatcher, svgId) {
+
+        function clickDocument(document){
+            dispatcher.post('ajax', [{
+                action: 'getDocument',
+                collection: "/",
+                'document': document,
+            }, 'renderData', {
+                collection: "/",
+                'document': document
+            }]);
+        }
+        var receiveSelfLabel = function(data){
+            var list_header = ["id","document_id","edittime","deleted","added"]
+            var table_str = '<table class="ui-widget-content" style="width:100%">';
+            table_str += "<tr>";
+            for (var j in list_header){
+                table_str += "<td>"+list_header[j]+"</td>"
+            }
+            table_str += "</tr>";
+            for(var i =0;i<data.rowsData.length;i++){
+                table_str += "<tr>";
+                var _row = data.rowsData[i];
+                for (var j in list_header){
+                    var _head = list_header[j]
+                    var _td = "<td>"
+                    if(_head=="document_id"){
+                        _td = "<td >"
+                        table_str += _td+"<input onclick='dispatcher.post(\"setDocument\",[this.value,this.value])' class='fullwidth' style='background: #5c9ccc' type='button' value='"+_row[list_header[j]].toString()+"'/>"+"</td>";
+                    }else{
+                        table_str += _td+_row[list_header[j]].toString()+"</td>";
+                    }
+
+                }
+                table_str += "</tr>";
+            }
+            table_str +="</table>";
+            $('#selfLabel_content').html(table_str);
+            $('#self_whole_count').val(data.all_num);
+            $('#self_whole_page').val(data.num_page);
+            $('#page_num').val(data.page_num);
+        }
+
+        dispatcher.
+            on('receiveSelfLabel', receiveSelfLabel).
+            on('clickDocument', clickDocument);
+    };
+
+    return customer;
+})(jQuery, window);

+ 1 - 0
iepy/webui/brat/static/client/src/url_monitor.js

@@ -59,6 +59,7 @@ var URLMonitor = (function($, window, undefined) {
           dispatcher.post('docChanged', [doc, oldDoc]);
         }
         setArguments(args || null);
+        dispatcher.post('getChangedOfDocument',[doc]);
       };
 
       var setCollection = function(coll, doc, args) {

+ 53 - 9
iepy/webui/brat/static/client/src/visualizer.js

@@ -1248,12 +1248,12 @@ var Visualizer = (function($, window, undefined) {
               var lastChar = fragment.to - fragment.chunk.from - 1;
 
               // Adjust for XML whitespace (#832, #1009)
-              var textUpToFirstChar = fragment.chunk.text.substring(0, firstChar);
-              var textUpToLastChar = fragment.chunk.text.substring(0, lastChar);
-              var textUpToFirstCharUnspaced = textUpToFirstChar.replace(/\s\s+/g, ' ');
-              var textUpToLastCharUnspaced = textUpToLastChar.replace(/\s\s+/g, ' ');
-              firstChar -= textUpToFirstChar.length - textUpToFirstCharUnspaced.length;
-              lastChar -= textUpToLastChar.length - textUpToLastCharUnspaced.length;
+              // var textUpToFirstChar = fragment.chunk.text.substring(0, firstChar);
+              // var textUpToLastChar = fragment.chunk.text.substring(0, lastChar);
+              // var textUpToFirstCharUnspaced = textUpToFirstChar.replace(/\s\s+/g, ' ');
+              // var textUpToLastCharUnspaced = textUpToLastChar.replace(/\s\s+/g, ' ');
+              // firstChar -= textUpToFirstChar.length - textUpToFirstCharUnspaced.length;
+              // lastChar -= textUpToLastChar.length - textUpToLastCharUnspaced.length;
               
               var startPos, endPos;
               if (firstChar < fragment.chunk.text.length) {
@@ -1276,9 +1276,10 @@ var Visualizer = (function($, window, undefined) {
                 to: Math.max(startPos, endPos)
               };
             } else { // it's markedText [id, start?, char#, offset, kind]
-              var textUpToChar = text.textContent.substring(0, fragment[2]);
-              var textUpToCharUnspaced = textUpToChar.replace(/\s\s+/g, ' ');
-              var pos = fragment[2] - (textUpToChar.length - textUpToCharUnspaced.length);
+              // var textUpToChar = text.textContent.substring(0, fragment[2]);
+              // var textUpToCharUnspaced = textUpToChar.replace(/\s\s+/g, ' ');
+              // var pos = fragment[2] - (textUpToChar.length - textUpToCharUnspaced.length);
+              var pos = fragment[2]
               if (pos < 0) pos = 0;
               if (!pos) { // start
                 fragment[3] = text.getStartPositionOfChar(pos).x;
@@ -3333,6 +3334,47 @@ Util.profileStart('before render');
         }
       };
 
+      function clickDocument(document){
+        dispatcher.post('ajax', [{
+          action: 'getDocument',
+          collection: "/",
+          'document': document,
+        }, 'renderData', {
+          collection: "/",
+          'document': document
+        }]);
+      }
+      var receiveSelfLabel = function(data){
+        var list_header = ["id","document_id","edittime","deleted","added"]
+        var table_str = '<table class="ui-widget-content" style="width:100%">';
+        table_str += "<tr>";
+        for (var j in list_header){
+          table_str += "<td>"+list_header[j]+"</td>"
+        }
+        table_str += "</tr>";
+        for(var i =0;i<data.rowsData.length;i++){
+          table_str += "<tr>";
+          var _row = data.rowsData[i];
+          for (var j in list_header){
+            var _head = list_header[j]
+            var _td = "<td>"
+            if(_head=="document_id"){
+              _td = "<td >"
+              table_str += _td+"<input onclick='dispatcher.post(\"setDocument\",[this.value,this.value])' class='fullwidth' style='background: #5c9ccc' type='button' value='"+_row[list_header[j]].toString()+"'/>"+"</td>";
+            }else{
+              table_str += _td+_row[list_header[j]].toString()+"</td>";
+            }
+
+          }
+          table_str += "</tr>";
+        }
+        table_str +="</table>";
+        $('#selfLabel_content').html(table_str);
+        $('#self_whole_count').val(data.all_num);
+        $('#self_whole_page').val(data.num_page);
+        $('#page_num').val(data.page_num);
+      }
+
       var setAbbrevs = function(_abbrevsOn) {
         // TODO: this is a slightly weird place to tweak the configuration
         Configuration.abbrevsOn = _abbrevsOn;
@@ -3542,6 +3584,8 @@ Util.profileStart('before render');
           on('textBackgrounds', setTextBackgrounds).
           on('layoutDensity', setLayoutDensity).
           on('svgWidth', setSvgWidth).
+          on('receiveSelfLabel', receiveSelfLabel).
+          on('clickDocument', clickDocument).
           on('current', gotCurrent).
           on('clearSVG', clearSVG).
           on('mouseover', onMouseOver).

+ 123 - 2
iepy/webui/brat/static/client/src/visualizer_ui.js

@@ -921,6 +921,10 @@ var VisualizerUI = (function($, window, undefined) {
         dispatcher.post('clearSearch');
       });
 
+      $('#searchSelfLabel').click(function(evt){
+        dispatcher.post('showForm',[selfLabel_form]);
+      });
+
       $('#searchLabel').click(function(evt){
         dispatcher.post('searchLabel');
       });
@@ -1554,7 +1558,8 @@ var VisualizerUI = (function($, window, undefined) {
 
       var receiveDocument = function(data){
         dispatcher.post('setDocument', [data.doc,data.doc]);
-        dispatcher.post('searchLabel');
+        // dispatcher.post('searchLabel');
+        // dispatcher.post('getChangedOfDocument');
       }
       var moveDocument = function(dir){
         dispatcher.post('allowReloadByURL')
@@ -1569,11 +1574,110 @@ var VisualizerUI = (function($, window, undefined) {
         }]);
       };
 
+      var wage_form = $('#wage_form')
+
+      dispatcher.post(initForm, [wage_form, {
+        no_ok:true,
+        no_cancel:false,
+        resizable: false,
+        width: 400,
+        open: function(evt) {
+              keymap = {};
+        },
+        close: function(evt) {
+          keymap = {};
+        }
+      }]);
+
+      var selfLabel_form = $('#selfLabel_form')
+
+      dispatcher.post(initForm, [selfLabel_form, {
+        no_ok:true,
+        no_cancel:true,
+        resizable: false,
+        width: 700,
+        height:630,
+        x:100,
+        y:0,
+        open: function(evt) {
+          keymap = {};
+        },
+        close: function(evt) {
+          keymap = {};
+        }
+      }]);
+      
+      $('#to_search').click(function(){
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":$('#page_num').val()
+        },'receiveSelfLabel',{
+          
+        }]);
+      })
+
+      $('#self_turn').click(function(){
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":$('#page_num').val()
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+      $('#self_last_page').click(function(){
+        var page_num = $('#page_num').val();
+        if(page_num!=""){
+          page_num = parseInt(page_num);
+          page_num -= 1;
+        }
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":page_num,
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+
+
+      $('#self_next_page').click(function(){
+        var page_num = $('#page_num').val();
+        if(page_num!=""){
+          page_num = parseInt(page_num);
+          page_num += 1;
+        }
+        dispatcher.post("ajax",[{
+          "action":"getSelfLabel",
+          "time_begin":$('#search_begin').val(),
+          "time_end":$('#search_end').val(),
+          "page_num":page_num,
+        },'receiveSelfLabel',{
+
+        }]);
+      })
+
+      $('#search_begin').datepicker({
+        dateFormat: 'yy-mm-dd',
+        inline: true
+      });
+      $('#search_end').datepicker({
+        dateFormat: 'yy-mm-dd',
+        inline: true
+      });
+
       var receiveLabel = function(data){
         $('#docCount').val(data.docCount);
         $('#eleCount').val(data.eleCount);
         $('#relCount').val(data.relCount);
         $('#wage').val(Math.round(data.wage,2));
+        dispatcher.post('showForm',[wage_form])
       };
       var searchLabel = function(){
         dispatcher.post('ajax',[{
@@ -1583,6 +1687,19 @@ var VisualizerUI = (function($, window, undefined) {
         }]);
       };
 
+      var receiveChangedOfD = function(data){
+        $('#document_deleted').val(data.deleted);
+        $('#document_added').val(data["added"])
+      };
+
+      var getChangedOfDocument = function(_doc){
+        dispatcher.post('ajax',[{
+          action:'getChangedOfDocument',
+          'document':_doc
+        },'receiveChangedOfD',{
+
+        }]);
+      };
 
 
       var moveInFileBrowser = function(dir) {
@@ -1824,7 +1941,7 @@ var VisualizerUI = (function($, window, undefined) {
           $cmpLink.button();
         }
           
-        $docName = $('#document_name input').val(coll + doc);
+        $docName = $('#document_id').val(coll + doc);
         var docName = $docName[0];
         // TODO do this on resize, as well
         // scroll the document name to the right, so the name is visible
@@ -1962,6 +2079,8 @@ var VisualizerUI = (function($, window, undefined) {
         showForm(aboutDialog);
       });
 
+
+
       // TODO: copy from annotator_ui; DRY it up
       var adjustFormToCursor = function(evt, element) {
         var screenHeight = $(window).height() - 8; // TODO HACK - no idea why -8 is needed
@@ -2408,7 +2527,9 @@ var VisualizerUI = (function($, window, undefined) {
           on('searchResultsReceived', searchResultsReceived).
           on('clearSearch', clearSearch).
           on('searchLabel',searchLabel).
+          on('getChangedOfDocument',getChangedOfDocument).
           on('receiveLabel',receiveLabel).
+          on('receiveChangedOfD',receiveChangedOfD).
           on('clearSVG', showNoDocMessage).
           on('screamingHalt', onScreamingHalt).
           on('configurationChanged', configurationChanged).

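Note: the selfLabel dialog added above pages through the current annotator's documents via page_num, while the server side (getSelfLabel in document.py) fixes the page size at 20 and rounds the page count up. A small standalone sketch of that paging arithmetic (page_window is an invented helper name):

    def page_window(all_num, page_num, limit=20):
        # 1-based page numbers, 20 rows per page, page count rounded up;
        # mirrors the _offset/num_page computation in getSelfLabel.
        page_num = max(int(page_num or 1), 1)
        offset = limit * (page_num - 1)
        num_page = all_num // limit + (1 if all_num % limit else 0)
        return offset, num_page

    # e.g. 37 matching documents, viewing page 2 -> skip 20 rows, 2 pages total
    assert page_window(37, 2) == (20, 2)
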
+ 1 - 1
iepy/webui/brat/static/style-ui.css

@@ -303,7 +303,7 @@ div.scroll_wrapper_upper div.scroller {
   margin-right: 100px;
 }
 #document_name input {
-  width: 100%;
+  width: 20%;
   border: none;
 }
 

+ 114 - 31
iepy/webui/brat/templates/brat/index.html

@@ -34,6 +34,7 @@ head.js(
         "{% static 'client/src/visualizer_ui.js' %}",
         "{% static 'client/src/annotator_ui.js' %}",
         "{% static 'client/src/spinner.js' %}",
+    "{% static 'client/src/custom.js' %}",
 );
 head.ready(function() {
   // var dispatcher = new Dispatcher(); // XXX DEBUG
@@ -45,6 +46,7 @@ head.ready(function() {
   var visualizerUI = new VisualizerUI(dispatcher, svg);
   var annotatorUI = new AnnotatorUI(dispatcher, svg);
   var spinner = new Spinner(dispatcher, '#spinner');
+  var customUI = new customer(dispatcher, svg); // local renamed so it does not shadow the customer module defined in custom.js
   var logger = new AnnotationLog(dispatcher);
   // Util.profileEnable();
   dispatcher.post('init');
@@ -116,8 +118,13 @@ njsw.parentNode.removeChild(njsw);
           <img id="prev" alt="Previous document (Cursor Left)" title="Previous document (Cursor Left)" src="{% static 'img/arrow-180.png' %}"/><img id="next" alt="Next document (Cursor Right)" title="Next document (Cursor Right)" src="{% static 'img/arrow.png' %}"/>
         </span>
   <div id="document_name">
-    <input readonly="readonly" class="ui-widget-header"></input>
+    <input id='document_id' readonly="readonly" class="ui-widget-header"></input>
+          deleted:
+          <input id='document_deleted' readonly="readonly" class="ui-widget-header"></input>
+          added:
+          <input id='document_added' readonly="readonly" class="ui-widget-header"></input>
   </div>
+
   <!-- <span id="document_ctime"/> -->
 </div>
 <div id="pulldown" class="unselectable ui-widget-content">
@@ -126,22 +133,17 @@ njsw.parentNode.removeChild(njsw);
 
 <!--      <input id="auth_button" type="button" value="Login"/>-->
         <input id="manage" type="button" value="manage" onclick="window.open(url='/admin')"/>
-
-        <input id="searchLabel" type="button" value="searchLabel"/>
-        <input type="text" value="文章数" disabled="disabled" style="width:10%"/>
-        <input id="docCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="要素数"  disabled="disabled" style="width:10%"/>
-        <input id="eleCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="关系数" disabled="disabled" style="width:10%"/>
-        <input id="relCount" type="text" disabled="disabled" style="width:10%"/>
-        <input type="text" value="合格结算价" disabled="disabled" style="width:10%"/>
-        <input id="wage" type="text" disabled="disabled" style="width:10%"/>
+        <input id="searchSelfLabel" type="button" value="search"/>
+<!--        <input id="search_button" type="checkbox" class="login" value="true" tabindex="-1"/>-->
+<!--        <label id="search_button_label" for="search_button" title="Search text and annotations">Search</label>-->
+<!--        <input id="clear_search_button" type="button" value="✕" tabindex="-1" title="Clear search" style="display: none"/>-->
+        <input id="searchLabel" type="button" value="wage"/>
 <!--      {#            <input id="collection_browser_button" type="button" value="Collection" tabindex="-1" title="Open Collection Browser (Tab)"/>#}-->
 <!--      {#            <input id="data_button" type="button" value="Data" tabindex="-1" title="Import, Export Data; Manage Collection"/>#}-->
-<!--      {#            <input id="search_button" type="checkbox" class="login" value="true" tabindex="-1"/><label id="search_button_label" for="search_button" title="Search text and annotations">Search</label><input id="clear_search_button" type="button" value="✕" tabindex="-1" title="Clear search" style="display: none"/>#}-->
-<!--      {#            <input id="unlock_type_button" type="button" value="Unlock" tabindex="-1" title="Stop annotating with the locked type"/>#}-->
+
+      <input id="unlock_type_button" type="button" value="Unlock" title="Stop annotating with the locked type"/>
       <!--<input id="undo_button" type="button" class="login" value="Undo" tabindex="-1"/>-->
-<!--      {#            <input id="options_button" type="button" value="Options" tabindex="-1" title="Set Visual, Annotation and Network Options"/>#}-->
+      <input id="options_button" type="button" value="Options" tabindex="-1" title="Set Visual, Annotation and Network Options"/>
     </div>
     <!-- Dummy span, fixes visual glitch (issue #535). TODO: better fix -->
     <span class="document_edit_time unselectable">&nbsp;</span>
@@ -303,28 +305,28 @@ njsw.parentNode.removeChild(njsw);
   </div>
   <div class="optionRow">
     <span class="optionLabel">Visualization width</span>
-    <input id="svg_width_value" maxlength="3" size="3" value="100"
+    <input id="svg_width_value" maxlength="3" size="3" value="100" onblur="if(this.value=='' || parseInt(this.value)<20){this.value='20';}"
            style="text-align:right"/>
     <span id="svg_width_unit" class="radio_group small-buttons">
             <input type="radio" id="svg_width_unit_percent" value="%"
                    name="svg_width_radio" checked="checked"/>
             <label for="svg_width_unit_percent">percent</label>
-            <input type="radio" id="svg_width_unit_pixels" value="px"
-                   name="svg_width_radio"/>
-            <label for="svg_width_unit_pixels">pixels</label>
+<!--            <input type="radio" id="svg_width_unit_pixels" value="px"-->
+<!--                   name="svg_width_radio"/>-->
+<!--            <label for="svg_width_unit_pixels">pixels</label>-->
           </span>
   </div>
-  <div class="optionRow small-buttons">
-    <span class="optionLabel">Paging</span>
-    size
-    <input id="paging_size" maxlength="3" size="3" value="10"
-           style="text-align:right"/>,
-    step
-    <input id="paging_step" maxlength="3" size="3" value="5"
-           style="text-align:right"/>
-    sentences
-    <input type="button" id="paging_clear" value="Clear"/>
-  </div>
+<!--  <div class="optionRow small-buttons">-->
+<!--    <span class="optionLabel">Paging</span>-->
+<!--    size-->
+<!--    <input id="paging_size" maxlength="3" size="3" value="10"-->
+<!--           style="text-align:right"/>,-->
+<!--    step-->
+<!--    <input id="paging_step" maxlength="3" size="3" value="5"-->
+<!--           style="text-align:right"/>-->
+<!--    sentences-->
+<!--    <input type="button" id="paging_clear" value="Clear"/>-->
+<!--  </div>-->
 </fieldset>
 <fieldset id="options_form_annotation" class="login">
   <legend>Annotation options</legend>
@@ -352,7 +354,7 @@ njsw.parentNode.removeChild(njsw);
            style="text-align:right"/>
   </div>
 </fieldset>
-<fieldset id="options_form_network">
+<fieldset id="options_form_network" style="display:none">
   <legend>Network options</legend>
   <div class="optionRow">
     <span class="optionLabel">Collaboration</span>
@@ -369,6 +371,87 @@ njsw.parentNode.removeChild(njsw);
 <fieldset>
   <textarea id="more_info_readme" readonly="readonly" class="borderless"></textarea>
 </fieldset>
+</form>
+
+<form id="wage_form" class="dialog" title="wage">
+    <div>
+        <table class="fullwidth">
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="文章数" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="docCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="要素数"  disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="eleCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr >
+            <tr>
+                <td>
+                    <input type="text" class="fullwidth" value="关系数" disabled="disabled"  style="background: #5c9ccc"/>
+                </td>
+                <td>
+                    <input id="relCount" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+            <tr >
+                <td >
+                    <input type="text" class="fullwidth" value="合格结算价" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+                <td >
+                    <input id="wage" class="fullwidth" type="text" disabled="disabled" style="background: #5c9ccc"/>
+                </td>
+            </tr>
+        </table>
+    </div>
+</form>
+
+
+<form id="selfLabel_form" class="dialog" title="selfLabel">
+    <div class="fullwidth" >
+        <table class="ui-widget-header" style="width:100%">
+            <tr>
+                <td >
+                    <input type="text" class="fullwidth" value="开始时间" readonly="readonly" style="background: #5c9ccc"/>
+                </td>
+                <td >
+                    <input id="search_begin" type="text"/>
+                </td >
+                <td >
+                    <input type="text" class="fullwidth" value="结束时间" readonly="readonly" style="background: #5c9ccc"/>
+                </td >
+                <td >
+                    <input id="search_end" type="text"/>
+                </td>
+                <td >
+                    <input id="to_search" type="button" class="fullwidth" value="查询"/>
+                </td>
+            </tr>
+        </table>
+    </div>
+    <div id="selfLabel_content" class="fullwidth" style="height:500px;overflow: scroll">
+
+    </div>
+    <div class="ui-widget-header" style="width:100%">
+        <tr>
+            <td style="width:60%">
+                记录共计<input id="self_whole_count" readonly="readonly" size="5"/>条,
+                <input id="self_whole_page" readonly="readonly" size="5"/>页
+            </td>
+            <td style="width:40%">
+                <input id="self_last_page" type="button" value="<"/>
+                <input id="page_num" type="number" size="5"/>
+                <input id="self_next_page" type="button" value=">"/>
+                <input id="self_turn" type="button" value="转到"/>
+            </td>
+        </tr>
+      </table>
+    </div>
 </form>
         <!-- Search dialog -->
 <form id="search_form" class="dialog" title="Search">
@@ -565,7 +648,7 @@ njsw.parentNode.removeChild(njsw);
           </span>
     </div>
     <div id="context_size_div" class="optionRow">
-      <span class="optionLabel" style="margin-left:1em;">Context length</span> <input id="context_length" maxlength="3" size="3" value="50"/> characters
+      <span class="optionLabel" style="margin-left:1em;">Context length</span> <input id="context_length" maxlength="3" minlenght="2" size="3" value="50"/> characters
     </div>
     <div class="optionRow">
       <span class="optionLabel">Match text as</span>

+ 2 - 2
iepy/webui/brat/views.py

@@ -87,8 +87,8 @@ def ajax_dispatch(request):
         from sys import path as sys_path
         log_critical('Heisenbug trap reports: ' + str(sys_path))
         raise
-    client_ip = request.META["REMOTE_ADDR"]
-    client_hostname = request.META["REMOTE_HOST"]
+    client_ip = request.META.get("REMOTE_ADDR",'')
+    client_hostname = request.META.get("REMOTE_HOST",'')
     # init_session(client_ip, cookie_data=cookie_data)
     # init_session_iepy(request)
     response_is_JSON = True

Some files were not shown because too many files changed in this commit