@@ -1019,6 +1019,112 @@ def tableToText(soup):
    return soup
    # return list_innerTable

+re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
+num_dict = {
+    "一": 1, "二": 2,
+    "三": 3, "四": 4,
+    "五": 5, "六": 6,
+    "七": 7, "八": 8,
+    "九": 9, "十": 10}
+# Convert Chinese numerals below one hundred to integers
+def change2num(text):
+    result_num = -1
+    # text = text[:6]
+    match = re_num.search(text)
+    if match:
+        _num = match.group()
+        if num_dict.get(_num):
+            return num_dict.get(_num)
+        else:
+            tenths = 1
+            the_unit = 0
+            num_split = _num.split("十")
+            if num_dict.get(num_split[0]):
+                tenths = num_dict.get(num_split[0])
+            if num_dict.get(num_split[1]):
+                the_unit = num_dict.get(num_split[1])
+            result_num = tenths * 10 + the_unit
+    elif re.search(r"\d{1,2}", text):
+        _num = re.search(r"\d{1,2}", text).group()
+        result_num = int(_num)
+    return result_num
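
A quick sketch of what change2num returns (values follow num_dict above; the inputs are made up):

    assert change2num("三、评标办法") == 3      # "三" -> 3
    assert change2num("十五、附件") == 15       # "十五" -> 10 + 5
    assert change2num("二十一、其他") == 21     # "二十一" -> 2*10 + 1
    assert change2num("12.开标时间") == 12      # Arabic-digit fallback
    assert change2num("无编号") == -1           # no match -> -1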
+# Handle outline segmentation
+def get_preprocessed_outline(soup):
+    pattern_0 = re.compile(r"^(?:[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十])[、.\.]")
+    pattern_1 = re.compile(r"^[\((]?(?:[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十])[\))]")
+    pattern_2 = re.compile(r"^\d{1,2}[、.\.](?=[^\d]{1,2}|$)")
+    pattern_3 = re.compile(r"^[\((]?\d{1,2}[\))]")
+    pattern_list = [pattern_0, pattern_1, pattern_2, pattern_3]
+
+    body = soup.find("body")
+    body_child = body.find_all(recursive=False)
+    deal_part = body
+    # print(body_child[0]['id'])
+    if 'id' in body_child[0].attrs:
+        if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+            deal_part = body_child[0]
+    if len(deal_part.find_all(recursive=False)) > 2:
+        deal_part = deal_part.parent
+    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table', 'thead', 'tfoot']
+    for part in deal_part.find_all(recursive=False):
+        # Locate the main trunk of the parsed text
+        is_main_text = False
+        through_text_num = 0
+        while not is_main_text and part.find_all(recursive=False):
+            while len(part.find_all(recursive=False)) == 1 and part.get_text(strip=True) == \
+                    part.find_all(recursive=False)[0].get_text(strip=True):
+                part = part.find_all(recursive=False)[0]
+            max_len = len(part.get_text(strip=True))
+            is_main_text = True
+            for t_part in part.find_all(recursive=False):
+                if t_part.name not in skip_tag and t_part.get_text(strip=True) != "":
+                    through_text_num += 1
+                if t_part.get_text(strip=True) != "" and len(t_part.get_text(strip=True))/max_len >= 0.65:
+                    if t_part.name not in skip_tag:
+                        is_main_text = False
+                        part = t_part
+                        break
+                    else:
+                        while len(t_part.find_all(recursive=False)) == 1 and t_part.get_text(strip=True) == \
+                                t_part.find_all(recursive=False)[0].get_text(strip=True):
+                            t_part = t_part.find_all(recursive=False)[0]
+                        if through_text_num > 2:
+                            is_table = True
+                            for _t_part in t_part.find_all(recursive=False):
+                                if _t_part.name not in skip_tag:
+                                    is_table = False
+                                    break
+                            if not is_table:
+                                is_main_text = False
+                                part = t_part
+                                break
+                        else:
+                            is_main_text = False
+                            part = t_part
+                            break
+        is_find = False
+        for _pattern in pattern_list:
+            last_index = 0
+            handle_list = []
+            for _part in part.find_all(recursive=False):
+                if _part.name not in skip_tag and _part.get_text(strip=True) != "":
+                    # print('text:', _part.get_text(strip=True))
+                    re_match = re.search(_pattern, _part.get_text(strip=True))
+                    if re_match:
+                        outline_index = change2num(re_match.group())
+                        if last_index < outline_index:
+                            # _part.insert_before("##split##")
+                            handle_list.append(_part)
+                            last_index = outline_index
+            if len(handle_list) > 1:
+                is_find = True
+                for _part in handle_list:
+                    _part.insert_before("##split##")
+            if is_find:
+                break
+    # print(soup)
+    return soup
+
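
A minimal sketch of the effect on a toy document (hypothetical HTML; real inputs come from the crawled announcements):

    from bs4 import BeautifulSoup

    html = ("<body><div id='pcontent'>"
            "<p>一、项目概况</p><p>内容A</p>"
            "<p>二、评标办法</p><p>内容B</p></div></body>")
    soup = get_preprocessed_outline(BeautifulSoup(html, "lxml"))
    # Both numbered headings now carry a marker:
    # ##split##一、项目概况 ... ##split##二、评标办法
    print(soup.get_text())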
# Data cleaning
def segment(soup,final=True):
    # print("==")
@@ -1071,6 +1177,11 @@ def segment(soup,final=True):
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after(",")
+        if child.name == 'div' and 'class' in child.attrs:
+            # Add the "attachment" marker for attachment blocks
+            if "richTextFetch" in child['class']:
+                child.insert_before("##attachment##")
+                # print(child.parent)
        # if child.name in subspaceList:
        #     child.insert_before("#subs"+str(child.name)+"#")
        #     child.insert_after("#sube"+str(child.name)+"#")
@@ -1164,7 +1275,8 @@ def segment(soup,final=True):
        _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
        LOOP_BEGIN += LOOP_LEN
    text = _text
-
+    # Turn the character before the attachment marker into a full stop,
+    # so body text and attachment content are not run together
+    text = re.sub("[^。](?=##attachment##)","。",text)
    return text
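
Note the design choice in the substitution above: the character immediately before ##attachment## is replaced by the full stop, not kept. A before/after sketch:

    import re

    text = "正文最后一行##attachment##附件内容"
    text = re.sub("[^。](?=##attachment##)", "。", text)
    print(text)  # 正文最后一。##attachment##附件内容 ("行" is consumed)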

'''
@@ -1467,12 +1579,12 @@ def get_preprocessed(articles,useselffool=False):
    '''
    cost_time = dict()
    list_articles = get_preprocessed_article(articles,cost_time)
-    list_sentences = get_preprocessed_sentences(list_articles,True,cost_time)
+    list_sentences,list_outlines = get_preprocessed_sentences(list_articles,True,cost_time)
    list_entitys = get_preprocessed_entitys(list_sentences,True,cost_time)

    calibrateEnterprise(list_articles,list_sentences,list_entitys)

-    return list_articles,list_sentences,list_entitys,cost_time
+    return list_articles,list_sentences,list_entitys,list_outlines,cost_time


def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
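
The call contract changes accordingly; a sketch of the new usage (Outline fields as used further down in this patch):

    list_articles, list_sentences, list_entitys, list_outlines, cost_time = \
        get_preprocessed(articles)
    for article_outlines in list_outlines:   # one list per article
        for outline in article_outlines:
            print(outline.outline_index, outline.outline_summary)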
@@ -1486,13 +1598,36 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
        doc_id = article[0]
        sourceContent = article[1]
        sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
+
+        sourceContent = sourceContent.replace('<br/>', '<br>')
+        sourceContent = re.sub(r"<br>(\s{0,}<br>)+","<br>",sourceContent)
+        for br_match in re.findall(r"[^>]+?<br>",sourceContent):
+            _new = re.sub("<br>","",br_match)
+            # Replace <br> tags with <p> tags
+            if not re.search(r"^\s+$",_new):
+                _new = '<p>'+_new + '</p>'
+            # print(br_match,_new)
+            sourceContent = sourceContent.replace(br_match,_new,1)
+
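
A self-contained sketch of the <br> -> <p> rewrite (same regexes, whitespace check omitted):

    import re

    s = "<div>第一行<br>第二行<br><br>第三行</div>"
    s = re.sub(r"<br>(\s{0,}<br>)+", "<br>", s)     # collapse repeated <br>
    for br_match in re.findall(r"[^>]+?<br>", s):   # text ending in <br>
        s = s.replace(br_match, "<p>" + br_match[:-4] + "</p>", 1)
    print(s)  # <div><p>第一行</p><p>第二行</p>第三行</div>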
        _send_doc_id = article[3]
        _title = article[4]
        page_time = article[5]
        # Table handling
        key_preprocess = "tableToText"
        start_time = time.time()
-        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+        # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
+        article_processed = BeautifulSoup(sourceContent,"lxml")
+        # article_processed = preprocessed_html(article_processed,"")
+        for _soup in article_processed.descendants:
+            # Detect bare text nodes (text without a tag) and wrap them in <p> tags
+            if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
+                # print(_soup.parent.string,_soup.string.strip())
+                _soup.wrap(article_processed.new_tag("p"))
+        # print(article_processed)
+        article_processed = get_preprocessed_outline(article_processed)
+        article_processed = tableToText(article_processed)
+        # print(article_processed)
+        article_processed = segment(article_processed)
        article_processed = article_processed.replace('.','.')  # 2021/12/01 fix OCR misreading of decimal points in PDFs
        article_processed = article_processed.replace('报价限价', '招标限价')  # 2021/12/17 renamed because '报价限价' was being predicted as the winning bid amount
        article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 corrected to the winning bid price
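
Taken together, the unrolled pipeline above now runs in this order (a sketch, not verbatim code):

    soup = BeautifulSoup(sourceContent, "lxml")   # 1. parse
    # 2. wrap bare text nodes in <p> (loop above)
    soup = get_preprocessed_outline(soup)         # 3. insert ##split## markers
    soup = tableToText(soup)                      # 4. flatten tables to text
    text = segment(soup)                          # 5. clean and join into text

The outline markers have to be inserted before tableToText/segment flatten the tree, which is why the old one-liner was unrolled.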
@@ -1547,6 +1682,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
    :return: list_sentences
    '''
    list_sentences = []
+    list_outlines = []
    for article in list_articles:
        list_sentences_temp = []
        list_entitys_temp = []
@@ -1557,7 +1693,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article.content
-
+        attachment_begin_index = -1
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
@@ -1572,13 +1708,66 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
        for _iter in re.finditer(split_patten,article_processed):
            _sen = article_processed[_begin:_iter.span()[1]]
            if len(_sen)>0 and _sen not in sentences_set:
+                # Mark sentences that fall inside the attachment
+                if re.search("##attachment##",_sen):
+                    attachment_begin_index = len(sentences)
+                    # _sen = re.sub("##attachment##","",_sen)
                sentences.append(_sen)
                sentences_set.add(_sen)
            _begin = _iter.span()[1]
        _sen = article_processed[_begin:]
+        if re.search("##attachment##", _sen):
+            # _sen = re.sub("##attachment##", "", _sen)
+            attachment_begin_index = len(sentences)
        if len(_sen)>0 and _sen not in sentences_set:
            sentences.append(_sen)
            sentences_set.add(_sen)
+        # Parse outline segments from the ##split## markers
+        outline_list = []
+        if re.search("##split##",article.content):
+            temp_sentences = []
+            last_sentence_index = (-1,-1)
+            outline_index = 0
+            for sentence_index in range(len(sentences)):
+                sentence_text = sentences[sentence_index]
+                for _ in re.findall("##split##", sentence_text):
+                    _match = re.search("##split##", sentence_text)
+                    if last_sentence_index[0] > -1:
+                        sentence_begin_index,wordOffset_begin = last_sentence_index
+                        sentence_end_index = sentence_index
+                        wordOffset_end = _match.start()
+                        if sentence_begin_index<attachment_begin_index and sentence_end_index>=attachment_begin_index:
+                            outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,attachment_begin_index-1,wordOffset_begin,len(sentences[attachment_begin_index-1])))
+                        else:
+                            outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,sentence_end_index,wordOffset_begin,wordOffset_end))
+                        outline_index += 1
+                    sentence_text = re.sub("##split##", "", sentence_text, count=1)
+                    last_sentence_index = (sentence_index, _match.start())
+                temp_sentences.append(sentence_text)
+            if attachment_begin_index > -1 and last_sentence_index[0] < attachment_begin_index:
+                outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],attachment_begin_index-1,last_sentence_index[1],len(temp_sentences[attachment_begin_index-1])))
+            else:
+                outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],len(sentences)-1,last_sentence_index[1],len(temp_sentences[-1])))
+            sentences = temp_sentences
+            # Resolve outline_text for each Outline
+            for _outline in outline_list:
+                if _outline.sentence_begin_index == _outline.sentence_end_index:
+                    _text = sentences[_outline.sentence_begin_index][_outline.wordOffset_begin:_outline.wordOffset_end]
+                else:
+                    _text = ""
+                    for idx in range(_outline.sentence_begin_index, _outline.sentence_end_index+1):
+                        if idx == _outline.sentence_begin_index:
+                            _text += sentences[idx][_outline.wordOffset_begin:]
+                        elif idx == _outline.sentence_end_index:
+                            _text += sentences[idx][:_outline.wordOffset_end]
+                        else:
+                            _text += sentences[idx]
+                _outline.outline_text = _text
+                _outline_summary = re.split("[::,]", _text, 1)[0]
+                if len(_outline_summary) < 20:
+                    _outline.outline_summary = _outline_summary
+                # print(_outline.outline_index,_outline.outline_text)
+
        article.content = "".join(sentences)
        # sentences.append(article_processed[_begin:])
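
A self-contained sketch of the ##split## bookkeeping above, with plain tuples standing in for the Outline objects:

    import re

    sentences = ["头部##split##一、概况。", "详情。", "##split##二、要求。"]
    outlines, last, temp = [], (-1, -1), []
    for i, s in enumerate(sentences):
        for _ in re.findall("##split##", s):
            m = re.search("##split##", s)
            if last[0] > -1:
                # (begin_sentence, end_sentence, begin_offset, end_offset)
                outlines.append((last[0], i, last[1], m.start()))
            s = re.sub("##split##", "", s, count=1)
            last = (i, m.start())
        temp.append(s)
    outlines.append((last[0], len(sentences) - 1, last[1], len(temp[-1])))
    print(outlines)  # [(0, 2, 2, 0), (2, 2, 0, 5)]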
@@ -1603,9 +1792,10 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
            cost_time[key_nerToken] = 0
        cost_time[key_nerToken] += round(time.time()-start_time,2)
-
+        in_attachment = False
        for sentence_index in range(len(sentences)):
-
+            if sentence_index == attachment_begin_index:
+                in_attachment = True
            sentence_text = sentences[sentence_index]
            tokens = tokens_all[sentence_index]
@@ -1614,12 +1804,12 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
            ner_entitys = ""

-            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))
-
+            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys,in_attachment=in_attachment))
        if len(list_sentences_temp)==0:
            list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=0,sentence_text="sentence_text",tokens=[],pos_tags=[],ner_tags=""))
        list_sentences.append(list_sentences_temp)
-    return list_sentences
+        list_outlines.append(outline_list)
+    return list_sentences,list_outlines
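
With in_attachment on Sentences, downstream consumers can split body text from attachment text, e.g. (hypothetical usage):

    body = [s for s in list_sentences_temp if not s.in_attachment]
    attachment = [s for s in list_sentences_temp if s.in_attachment]

Note the empty-article fallback Sentences above is still built without in_attachment, so the constructor presumably defaults it (to False).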

def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
    '''
@@ -1666,6 +1856,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
        sentence_text = list_sentence[sentence_index].sentence_text
        tokens = list_sentence[sentence_index].tokens
        doc_id = list_sentence[sentence_index].doc_id
+        in_attachment = list_sentence[sentence_index].in_attachment
        list_tokenbegin = []
        begin = 0
        for i in range(0,len(tokens)):
@@ -1739,7 +1930,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                # Strip punctuation marks
                entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
-                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
+                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
            # Mark the "publisher" and "publish time" entities at the end of the article
            if sentence_index==len(list_sentence)-1:
                if len(list_sentence_entitys[-2:])>2:
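
The same in_attachment=in_attachment argument is repeated at every Entity construction site below; a hypothetical helper (not part of this patch) would make that explicit:

    def make_entity(*args, **kwargs):
        # closes over in_attachment from the enclosing sentence loop
        kwargs.setdefault("in_attachment", in_attachment)
        return Entity(*args, **kwargs)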
@@ -1974,7 +2165,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                        _exists = True
                    if not _exists:
                        if float(entity_text)>1:
-                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
+                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
                            list_sentence_entitys[-1].notes = notes  # 2021/7/20 added amount notes
                            list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 added amount unit
                            # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
@@ -2026,7 +2217,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = person['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Funding source extraction, added 2020/12/30
            list_moneySource = extract_moneySource(sentence_text)
@@ -2050,7 +2241,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = moneySource['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Email extraction, added 2021/11/04
            list_email = extract_email(sentence_text)
@@ -2074,7 +2265,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = email['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Service period extraction, added 2020/12/30
            list_servicetime = extract_servicetime(sentence_text)
@@ -2098,7 +2289,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = servicetime['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            # Bidding method extraction, added 2020/12/30
            # list_bidway = extract_bidway(sentence_text, )
@@ -2140,7 +2331,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                entity_text = ratio['body']
                list_sentence_entitys.append(
                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))

            list_sentence_entitys.sort(key=lambda x:x.begin_index)
            list_entitys_temp = list_entitys_temp+list_sentence_entitys