Эх сурвалжийг харах

大纲分段、招标人联系方式召回方法

znj 3 жил өмнө
parent
commit
f4ba43bd11

+ 2 - 2
BiddingKG/dl/entityLink/entityLink.py

@@ -265,7 +265,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                 begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                 end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                 entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
-                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
+                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"],in_attachment=p_sentence.in_attachment)
                                 add_entity.if_dict_match = 1
                                 add_entity.if_dict_match = 1
                                 list_entity.append(add_entity)
                                 list_entity.append(add_entity)
 
 
@@ -330,7 +330,7 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                     begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                     end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
-                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
+                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)
                     list_entity.append(add_entity)
                     list_entity.append(add_entity)
                     range_entity.append(add_entity)
                     range_entity.append(add_entity)
 
 

+ 49 - 3
BiddingKG/dl/interface/Entitys.py

@@ -93,7 +93,7 @@ class Sentences():
     @summary:句子类
     @summary:句子类
     '''
     '''
     
     
-    def __init__(self,doc_id,sentence_index,sentence_text,tokens,pos_tags,ner_tags):
+    def __init__(self,doc_id,sentence_index,sentence_text,tokens,pos_tags,ner_tags,in_attachment=False):
         '''
         '''
         @param:
         @param:
             doc_id:文章的uuid
             doc_id:文章的uuid
@@ -110,6 +110,7 @@ class Sentences():
         self.tokens = tokens
         self.tokens = tokens
         self.pos_tags = pos_tags
         self.pos_tags = pos_tags
         self.ner_tags = ner_tags
         self.ner_tags = ner_tags
+        self.in_attachment = in_attachment # 2022/02/10添加,句子是否在附件中
 
 
     def toJson(self):
     def toJson(self):
         _dict = {"doc_id":self.doc_id,"sentence_index":self.sentence_index,"sentence_text":self.sentence_text,
         _dict = {"doc_id":self.doc_id,"sentence_index":self.sentence_index,"sentence_text":self.sentence_text,
@@ -123,12 +124,38 @@ class Sentences():
                          _dict.get("pos_tags"),_dict.get("ner_tags"))
                          _dict.get("pos_tags"),_dict.get("ner_tags"))
 
 
 
 
class Outline():
    '''
    @summary: one outline section of an announcement; sections are delimited
              by headings detected with regular expressions
    '''

    def __init__(self, doc_id, outline_index, outline_text, sentence_begin_index, sentence_end_index,
                 wordOffset_begin, wordOffset_end, outline_summary=""):
        '''
        @param:
            doc_id: uuid of the article
            outline_index: index of this section among the article's outline sections
            outline_text: full text of the section
            sentence_begin_index: index of the sentence in which the section starts
            sentence_end_index: index of the sentence in which the section ends
            wordOffset_begin: character offset of the section start inside the first sentence
            wordOffset_end: character offset of the section end inside the last sentence
            outline_summary: short summary (heading) of the section; defaults to ""
                and may be assigned later by the caller
        '''
        self.doc_id = doc_id
        self.outline_index = outline_index
        self.outline_text = outline_text
        self.sentence_begin_index = sentence_begin_index
        self.sentence_end_index = sentence_end_index
        self.wordOffset_begin = wordOffset_begin
        self.wordOffset_end = wordOffset_end
        self.outline_summary = outline_summary

    def __repr__(self):
        # Compact representation for debugging; the full text is omitted on purpose.
        return "Outline(doc_id=%r, outline_index=%r, sentences=%r-%r, summary=%r)" % (
            self.doc_id, self.outline_index, self.sentence_begin_index,
            self.sentence_end_index, self.outline_summary)
+
 class Entity():
 class Entity():
     '''
     '''
     @summary:实体类
     @summary:实体类
     '''
     '''
     
     
-    def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None):
+    def __init__(self,doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,wordOffset_begin=None,wordOffset_end=None,label=None,values=None,person_phone=None,in_attachment=False):
         '''
         '''
         @param:
         @param:
             doc_id:文章的uuid
             doc_id:文章的uuid
@@ -175,6 +202,7 @@ class Entity():
         self.pointer_serviceTime = None  # 2022/01/05 新增,中标人对应链接"服务期限(工期)"
         self.pointer_serviceTime = None  # 2022/01/05 新增,中标人对应链接"服务期限(工期)"
         self.pointer_ratio = None  # 2022/01/05 新增,中标人对应链接"中投标金额->费率、下浮率"
         self.pointer_ratio = None  # 2022/01/05 新增,中标人对应链接"中投标金额->费率、下浮率"
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
+        self.in_attachment = in_attachment  # 2022/02/10添加,实体是否在附件中
 
 
     def set_Role(self,role_label,role_values):
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)
         self.label = int(role_label)
@@ -272,7 +300,25 @@ class Role():
         self.linklist = [item for item in set(self.linklist)]
         self.linklist = [item for item in set(self.linklist)]
         # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
         # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist]
         # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
         # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit]
-        result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit,self.ratio,self.serviceTime]
+        # result = [self.role_name,fitDataByRule(self.entity_text),self.money,self.linklist, self.money_unit,self.ratio,self.serviceTime]
+        floating_ratio = "" # 上浮率
+        downward_floating_ratio = "" # 下浮率
+        discount_ratio = "" # 折扣率/费率
+        if self.ratio:
+            num_value = re.search("[\d\.]+",self.ratio).group()
+            num_value = float(num_value)
+            if re.search("%|百分之",self.ratio):
+                num_value = num_value / 100
+            num_value = str('%.4f'%(num_value))
+            if re.search("上浮",self.ratio):
+                floating_ratio = num_value
+            elif re.search("下浮",self.ratio):
+                downward_floating_ratio = num_value
+            else:
+                discount_ratio = num_value
+        result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
+                  'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
+                  'linklist': self.linklist,'serviceTime':self.serviceTime}
         return result
         return result
 
 
 # 用于KM算法的组合配对
 # 用于KM算法的组合配对

+ 208 - 17
BiddingKG/dl/interface/Preprocessing.py

@@ -1019,6 +1019,112 @@ def tableToText(soup):
     return soup
     return soup
     # return list_innerTable
     # return list_innerTable
 
 
# Matches a Chinese numeral in the range 1-99 (e.g. "二十一", "十五", "七").
re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
num_dict = {
    "一": 1, "二": 2,
    "三": 3, "四": 4,
    "五": 5, "六": 6,
    "七": 7, "八": 8,
    "九": 9, "十": 10}
# Fallback pattern: a run of one or two digits. Raw string avoids the
# invalid "\d" escape in a normal string literal.
_re_arabic_num = re.compile(r"\d{1,2}")


def change2num(text):
    """Convert the first number found in *text* to an int.

    A Chinese numeral below one hundred takes priority wherever it appears
    in *text*; only when none is present is the first run of one or two
    digits used instead.

    :param text: string that may contain a Chinese numeral or digits
    :return: the numeric value, or -1 when *text* contains no number
    """
    match = re_num.search(text)
    if match:
        _num = match.group()
        if _num in num_dict:
            # Single character "一".."十".
            return num_dict[_num]
        # Compound form "<tens>十<units>"; either side of "十" may be empty
        # ("十一" -> 11, "二十" -> 20).
        tens_text, _, units_text = _num.partition("十")
        return num_dict.get(tens_text, 1) * 10 + num_dict.get(units_text, 0)
    match = _re_arabic_num.search(text)
    if match:
        return int(match.group())
    return -1
# Outline segmentation.
# Chinese numeral 1-99, shared by the heading patterns below.
_OUTLINE_NUM_CN = "(?:[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十])"
# Heading styles, tried in this order; the first style that yields more than
# one heading wins. Character classes accept both half-width and full-width
# punctuation (\uFF08 / \uFF09 = full-width parentheses, \uFF0E = full-width dot).
_OUTLINE_PATTERNS = (
    re.compile(r"^" + _OUTLINE_NUM_CN + r"[、.\uFF0E]"),                   # 一、
    re.compile(r"^[(\uFF08]?" + _OUTLINE_NUM_CN + r"[)\uFF09]"),           # (一)
    re.compile(r"^\d{1,2}[、.\uFF0E](?=[^\d]{1,2}|$)"),                    # 1、 (not 1.5)
    re.compile(r"^[(\uFF08]?\d{1,2}[)\uFF09]"),                            # (1)
)
# Tag names that are treated as table structure and never as headings.
_OUTLINE_SKIP_TAGS = ('turntable', 'tbody', 'th', 'tr', 'td', 'table', 'thead', 'tfoot')


def _descend_single_child(node):
    """Walk down while *node* has exactly one child tag that holds all of its text."""
    children = node.find_all(recursive=False)
    while len(children) == 1 and node.get_text(strip=True) == children[0].get_text(strip=True):
        node = children[0]
        children = node.find_all(recursive=False)
    return node


def get_preprocessed_outline(soup):
    """Mark outline headings in *soup* by inserting "##split##" before them.

    For every top-level part of the document the function first descends to
    the element that holds the main text, then looks among that element's
    direct children for numbered headings (see ``_OUTLINE_PATTERNS``) whose
    numbers are strictly increasing. When one heading style yields more than
    one heading, the text node "##split##" is inserted before each of them.

    :param soup: parsed document (BeautifulSoup tree); modified in place
    :return: the same *soup* object
    """
    body = soup.find("body")
    if body is None:
        # Nothing to segment (e.g. empty document).
        return soup
    body_child = body.find_all(recursive=False)
    if not body_child:
        return soup

    deal_part = body
    if len(body_child) <= 2 and body_child[0].attrs.get('id') == 'pcontent':
        deal_part = body_child[0]
    if len(deal_part.find_all(recursive=False)) > 2 and deal_part.parent is not None:
        deal_part = deal_part.parent

    for part in deal_part.find_all(recursive=False):
        # Locate the element that holds the main body of the text.
        is_main_text = False
        through_text_num = 0  # non-table children with text seen so far (all levels)
        while not is_main_text and part.find_all(recursive=False):
            part = _descend_single_child(part)
            max_len = len(part.get_text(strip=True))
            is_main_text = True
            for t_part in part.find_all(recursive=False):
                t_text = t_part.get_text(strip=True)
                if t_text == "":
                    continue
                is_skipped = t_part.name in _OUTLINE_SKIP_TAGS
                if not is_skipped:
                    through_text_num += 1
                # Only a child holding at least 65% of the text can be the main part.
                if len(t_text) / max_len < 0.65:
                    continue
                if is_skipped:
                    t_part = _descend_single_child(t_part)
                    if through_text_num > 2 and all(
                            _t_part.name in _OUTLINE_SKIP_TAGS
                            for _t_part in t_part.find_all(recursive=False)):
                        # A pure table after several text blocks: stay on this level.
                        continue
                is_main_text = False
                part = t_part
                break

        # Look for a heading style with more than one increasing heading.
        for _pattern in _OUTLINE_PATTERNS:
            last_index = 0
            handle_list = []
            for _part in part.find_all(recursive=False):
                if _part.name in _OUTLINE_SKIP_TAGS:
                    continue
                _part_text = _part.get_text(strip=True)
                if _part_text == "":
                    continue
                re_match = _pattern.search(_part_text)
                if not re_match:
                    continue
                outline_index = change2num(re_match.group())
                if last_index < outline_index:
                    handle_list.append(_part)
                    last_index = outline_index
            if len(handle_list) > 1:
                for _part in handle_list:
                    _part.insert_before("##split##")
                break
    return soup
+
 #数据清洗
 #数据清洗
 def segment(soup,final=True):
 def segment(soup,final=True):
     # print("==")
     # print("==")
@@ -1071,6 +1177,11 @@ def segment(soup,final=True):
             child.insert_after("。")
             child.insert_after("。")
         if child.name in commaList:
         if child.name in commaList:
             child.insert_after(",")
             child.insert_after(",")
+        if child.name == 'div' and 'class' in child.attrs:
+            # 添加附件"attachment"标识
+            if "richTextFetch" in child['class']:
+                child.insert_before("##attachment##")
+                # print(child.parent)
         # if child.name in subspaceList:
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
@@ -1164,7 +1275,8 @@ def segment(soup,final=True):
             _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
             LOOP_BEGIN += LOOP_LEN
         text = _text
         text = _text
-
+    # 附件标识前修改为句号,避免正文和附件内容混合在一起
+    text = re.sub("[^。](?=##attachment##)","。",text)
     return text
     return text
 
 
 '''
 '''
@@ -1467,12 +1579,12 @@ def get_preprocessed(articles,useselffool=False):
     '''
     '''
     cost_time = dict()
     cost_time = dict()
     list_articles = get_preprocessed_article(articles,cost_time)
     list_articles = get_preprocessed_article(articles,cost_time)
-    list_sentences = get_preprocessed_sentences(list_articles,True,cost_time)
+    list_sentences,list_outlines = get_preprocessed_sentences(list_articles,True,cost_time)
     list_entitys = get_preprocessed_entitys(list_sentences,True,cost_time)
     list_entitys = get_preprocessed_entitys(list_sentences,True,cost_time)
 
 
     calibrateEnterprise(list_articles,list_sentences,list_entitys)
     calibrateEnterprise(list_articles,list_sentences,list_entitys)
 
 
-    return list_articles,list_sentences,list_entitys,cost_time
+    return list_articles,list_sentences,list_entitys,list_outlines,cost_time
     
     
 
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
@@ -1486,13 +1598,36 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         doc_id = article[0]
         doc_id = article[0]
         sourceContent = article[1]
         sourceContent = article[1]
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
+
+        sourceContent = sourceContent.replace('<br/>', '<br>')
+        sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
+        for br_match in re.findall("[^>]+?<br>",sourceContent):
+            _new = re.sub("<br>","",br_match)
+            # <br>标签替换为<p>标签
+            if not re.search("^\s+$",_new):
+                _new = '<p>'+_new + '</p>'
+                # print(br_match,_new)
+                sourceContent = sourceContent.replace(br_match,_new,1)
+
         _send_doc_id = article[3]
         _send_doc_id = article[3]
         _title = article[4]
         _title = article[4]
         page_time = article[5]
         page_time = article[5]
         #表格处理
         #表格处理
         key_preprocess = "tableToText"
         key_preprocess = "tableToText"
         start_time = time.time()
         start_time = time.time()
-        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+        # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
+        article_processed = BeautifulSoup(sourceContent,"lxml")
+        # article_processed = preprocessed_html(article_processed,"")
+        for _soup in article_processed.descendants:
+            # 识别无标签文本,添加<p>标签
+            if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
+                # print(_soup.parent.string,_soup.string.strip())
+                _soup.wrap(article_processed.new_tag("p"))
+        # print(article_processed)
+        article_processed = get_preprocessed_outline(article_processed)
+        article_processed = tableToText(article_processed)
+        # print(article_processed)
+        article_processed = segment(article_processed)
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
@@ -1547,6 +1682,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
     :return: list_sentences
     :return: list_sentences
     '''
     '''
     list_sentences = []
     list_sentences = []
+    list_outlines = []
     for article in list_articles:
     for article in list_articles:
         list_sentences_temp = []
         list_sentences_temp = []
         list_entitys_temp = []
         list_entitys_temp = []
@@ -1557,7 +1693,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         key_preprocess = "tableToText"
         key_preprocess = "tableToText"
         start_time = time.time()
         start_time = time.time()
         article_processed = article.content
         article_processed = article.content
-
+        attachment_begin_index = -1
 
 
         if key_preprocess not in cost_time:
         if key_preprocess not in cost_time:
             cost_time[key_preprocess] = 0
             cost_time[key_preprocess] = 0
@@ -1572,13 +1708,66 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
             for _iter in re.finditer(split_patten,article_processed):
             for _iter in re.finditer(split_patten,article_processed):
                 _sen = article_processed[_begin:_iter.span()[1]]
                 _sen = article_processed[_begin:_iter.span()[1]]
                 if len(_sen)>0 and _sen not in sentences_set:
                 if len(_sen)>0 and _sen not in sentences_set:
+                    # 标识在附件里的句子
+                    if re.search("##attachment##",_sen):
+                        attachment_begin_index = len(sentences)
+                        # _sen = re.sub("##attachment##","",_sen)
                     sentences.append(_sen)
                     sentences.append(_sen)
                     sentences_set.add(_sen)
                     sentences_set.add(_sen)
                 _begin = _iter.span()[1]
                 _begin = _iter.span()[1]
             _sen = article_processed[_begin:]
             _sen = article_processed[_begin:]
+            if re.search("##attachment##", _sen):
+                # _sen = re.sub("##attachment##", "", _sen)
+                attachment_begin_index = len(sentences)
             if len(_sen)>0 and _sen not in sentences_set:
             if len(_sen)>0 and _sen not in sentences_set:
                 sentences.append(_sen)
                 sentences.append(_sen)
                 sentences_set.add(_sen)
                 sentences_set.add(_sen)
+            # 解析outline大纲分段
+            outline_list = []
+            if re.search("##split##",article.content):
+                temp_sentences = []
+                last_sentence_index = (-1,-1)
+                outline_index = 0
+                for sentence_index in range(len(sentences)):
+                    sentence_text = sentences[sentence_index]
+                    for _ in re.findall("##split##", sentence_text):
+                        _match = re.search("##split##", sentence_text)
+                        if last_sentence_index[0] > -1:
+                            sentence_begin_index,wordOffset_begin = last_sentence_index
+                            sentence_end_index = sentence_index
+                            wordOffset_end = _match.start()
+                            if sentence_begin_index<attachment_begin_index and sentence_end_index>=attachment_begin_index:
+                                outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,attachment_begin_index-1,wordOffset_begin,len(sentences[attachment_begin_index-1])))
+                            else:
+                                outline_list.append(Outline(doc_id,outline_index,'',sentence_begin_index,sentence_end_index,wordOffset_begin,wordOffset_end))
+                            outline_index += 1
+                        sentence_text = re.sub("##split##", "", sentence_text,count=1)
+                        last_sentence_index = (sentence_index,_match.start())
+                    temp_sentences.append(sentence_text)
+                if attachment_begin_index>-1 and last_sentence_index[0]<attachment_begin_index:
+                    outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],attachment_begin_index-1,last_sentence_index[1],len(temp_sentences[attachment_begin_index-1])))
+                else:
+                    outline_list.append(Outline(doc_id,outline_index,'',last_sentence_index[0],len(sentences)-1,last_sentence_index[1],len(temp_sentences[-1])))
+                sentences = temp_sentences
+            #解析outline的outline_text内容
+            for _outline in outline_list:
+                if _outline.sentence_begin_index==_outline.sentence_end_index:
+                    _text = sentences[_outline.sentence_begin_index][_outline.wordOffset_begin:_outline.wordOffset_end]
+                else:
+                    _text = ""
+                    for idx in range(_outline.sentence_begin_index,_outline.sentence_end_index+1):
+                        if idx==_outline.sentence_begin_index:
+                            _text += sentences[idx][_outline.wordOffset_begin:]
+                        elif idx==_outline.sentence_end_index:
+                            _text += sentences[idx][:_outline.wordOffset_end]
+                        else:
+                            _text += sentences[idx]
+                _outline.outline_text = _text
+                _outline_summary = re.split("[::,]",_text,1)[0]
+                if len(_outline_summary)<20:
+                    _outline.outline_summary = _outline_summary
+                # print(_outline.outline_index,_outline.outline_text)
+
             article.content = "".join(sentences)
             article.content = "".join(sentences)
             # sentences.append(article_processed[_begin:])
             # sentences.append(article_processed[_begin:])
 
 
@@ -1603,9 +1792,10 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
                 cost_time[key_nerToken] = 0
                 cost_time[key_nerToken] = 0
             cost_time[key_nerToken] += round(time.time()-start_time,2)
             cost_time[key_nerToken] += round(time.time()-start_time,2)
 
 
-
+            in_attachment = False
             for sentence_index in range(len(sentences)):
             for sentence_index in range(len(sentences)):
-
+                if sentence_index == attachment_begin_index:
+                    in_attachment = True
                 sentence_text = sentences[sentence_index]
                 sentence_text = sentences[sentence_index]
                 tokens = tokens_all[sentence_index]
                 tokens = tokens_all[sentence_index]
 
 
@@ -1614,12 +1804,12 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
 
                 ner_entitys = ""
                 ner_entitys = ""
 
 
-                list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))
-
+                list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys,in_attachment=in_attachment))
         if len(list_sentences_temp)==0:
         if len(list_sentences_temp)==0:
             list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=0,sentence_text="sentence_text",tokens=[],pos_tags=[],ner_tags=""))
             list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=0,sentence_text="sentence_text",tokens=[],pos_tags=[],ner_tags=""))
         list_sentences.append(list_sentences_temp)
         list_sentences.append(list_sentences_temp)
-    return list_sentences
+        list_outlines.append(outline_list)
+    return list_sentences,list_outlines
 
 
 def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
     '''
@@ -1666,6 +1856,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             sentence_text = list_sentence[sentence_index].sentence_text
             sentence_text = list_sentence[sentence_index].sentence_text
             tokens = list_sentence[sentence_index].tokens
             tokens = list_sentence[sentence_index].tokens
             doc_id = list_sentence[sentence_index].doc_id
             doc_id = list_sentence[sentence_index].doc_id
+            in_attachment = list_sentence[sentence_index].in_attachment
             list_tokenbegin = []
             list_tokenbegin = []
             begin = 0
             begin = 0
             for i in range(0,len(tokens)):
             for i in range(0,len(tokens)):
@@ -1739,7 +1930,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #去掉标点符号
                 #去掉标点符号
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
-                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1]))
+                list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:
             if sentence_index==len(list_sentence)-1:
                 if len(list_sentence_entitys[-2:])>2:
                 if len(list_sentence_entitys[-2:])>2:
@@ -1974,7 +2165,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             _exists = True
                             _exists = True
                     if not _exists:
                     if not _exists:
                         if float(entity_text)>1:
                         if float(entity_text)>1:
-                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp))
+                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
                             # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
                             # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
@@ -2026,7 +2217,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = person['body']
                 entity_text = person['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
 
             # 资金来源提取  2020/12/30 新增
             # 资金来源提取  2020/12/30 新增
             list_moneySource = extract_moneySource(sentence_text)
             list_moneySource = extract_moneySource(sentence_text)
@@ -2050,7 +2241,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = moneySource['body']
                 entity_text = moneySource['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
 
             # 电子邮箱提取 2021/11/04 新增
             # 电子邮箱提取 2021/11/04 新增
             list_email = extract_email(sentence_text)
             list_email = extract_email(sentence_text)
@@ -2074,7 +2265,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = email['body']
                 entity_text = email['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
 
             # 服务期限提取 2020/12/30 新增
             # 服务期限提取 2020/12/30 新增
             list_servicetime = extract_servicetime(sentence_text)
             list_servicetime = extract_servicetime(sentence_text)
@@ -2098,7 +2289,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = servicetime['body']
                 entity_text = servicetime['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
 
             # 招标方式提取 2020/12/30 新增
             # 招标方式提取 2020/12/30 新增
             # list_bidway = extract_bidway(sentence_text, )
             # list_bidway = extract_bidway(sentence_text, )
@@ -2140,7 +2331,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_text = ratio['body']
                 entity_text = ratio['body']
                 list_sentence_entitys.append(
                 list_sentence_entitys.append(
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
                     Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp))
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
 
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
             list_entitys_temp = list_entitys_temp+list_sentence_entitys

+ 2 - 3
BiddingKG/dl/interface/extract.py

@@ -47,7 +47,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
 
     start_time = time.time()
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
     log("start process doc %s"%(str(doc_id)))
-    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
+    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
     cost_time.update(_cost_time)
@@ -124,7 +124,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     # 依赖句子顺序
     # 依赖句子顺序
     start_time = time.time()  # 实体链接
     start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
     entityLink.link_entitys(list_entitys)
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
     log("get attributes done of doc_id%s"%(doc_id))
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
     cost_time["attrs"] = round(time.time()-start_time,2)
 
 
@@ -152,7 +152,6 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
     #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
     #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
     #                str(_entity.begin_index),str(_entity.end_index)))
     #                str(_entity.begin_index),str(_entity.end_index)))
-
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)#, list_articles[0].content, list_entitys[0]
     return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)#, list_articles[0].content, list_entitys[0]
 
 
 
 

+ 119 - 71
BiddingKG/dl/interface/getAttributes.py

@@ -758,7 +758,7 @@ def getPackagesFromArticle(list_sentence,list_entity):
                     continue
                     continue
 
 
                 #add package to entity
                 #add package to entity
-                _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"])
+                _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"],in_attachment=list_sentence[i].in_attachment)
                 list_entity.append(_pack_entity)
                 list_entity.append(_pack_entity)
                 copy_pack = copy.copy(PackageList_scope[j])
                 copy_pack = copy.copy(PackageList_scope[j])
                 copy_pack["scope"] = [scope_begin,scope_end]
                 copy_pack["scope"] = [scope_begin,scope_end]
@@ -790,7 +790,7 @@ def dispatch(match_list):
 from BiddingKG.dl.common.Utils import getUnifyMoney
 from BiddingKG.dl.common.Utils import getUnifyMoney
 from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
 from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
 relationExtraction_model = Model_relation_extraction()
 relationExtraction_model = Model_relation_extraction()
-def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4):
+def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_sentence,list_entity,list_outline,on_value = 0.5,on_value_person=0.5,sentence_len=4):
     '''
     '''
     @param:
     @param:
         PackDict:文章包dict
         PackDict:文章包dict
@@ -1060,9 +1060,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
                         distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
                                            tokens_num_dict[entity.sentence_index] + entity.end_index)
                                            tokens_num_dict[entity.sentence_index] + entity.end_index)
                         sentence_distance = after_entity.sentence_index - entity.sentence_index
                         sentence_distance = after_entity.sentence_index - entity.sentence_index
+                        value = (-1 / 2 * (distance ** 2)) / 10000
+                        if link_attribute == "money":
+                            if after_entity.notes == '单价':
+                                value = value * 100
                         if sentence_distance == 0:
                         if sentence_distance == 0:
                             if distance < 100:
                             if distance < 100:
-                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                # value = (-1 / 2 * (distance ** 2)) / 10000
                                 temp_match_list.append(Match(entity, after_entity, value))
                                 temp_match_list.append(Match(entity, after_entity, value))
                                 match_nums += 1
                                 match_nums += 1
                                 if not tenderer_nums:
                                 if not tenderer_nums:
@@ -1071,7 +1075,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                     break
                                     break
                         else:
                         else:
                             if distance < 60:
                             if distance < 60:
-                                value = (-1 / 2 * (distance ** 2)) / 10000
+                                # value = (-1 / 2 * (distance ** 2)) / 10000
                                 temp_match_list.append(Match(entity, after_entity, value))
                                 temp_match_list.append(Match(entity, after_entity, value))
                                 match_nums += 1
                                 match_nums += 1
                                 if not tenderer_nums:
                                 if not tenderer_nums:
@@ -1242,6 +1246,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
     code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
     code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
     for _sentence in list_sentence:
     for _sentence in list_sentence:
         sentence_text = _sentence.sentence_text
         sentence_text = _sentence.sentence_text
+        in_attachment = _sentence.in_attachment
         list_tokenbegin = []
         list_tokenbegin = []
         begin = 0
         begin = 0
         for i in range(0, len(_sentence.tokens)):
         for i in range(0, len(_sentence.tokens)):
@@ -1306,7 +1311,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if not last_phone_mask:
                 if not last_phone_mask:
                     item_start = item[1]
                     item_start = item[1]
                     last_item_end = res_set[item_idx-1][2]
                     last_item_end = res_set[item_idx-1][2]
-                    if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―]+$",sentence_text[last_item_end:item_start]):
+                    if item_start - last_item_end<=1 or re.search("^[\da-zA-Z\-—-―]+$",sentence_text[last_item_end:item_start]):
                         last_phone_mask = False
                         last_phone_mask = False
                         continue
                         continue
             for j in range(len(list_tokenbegin)):
             for j in range(len(list_tokenbegin)):
@@ -1321,7 +1326,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     end_index = j - 1
                     end_index = j - 1
                     break
                     break
             _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
             _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
-                             item[2])
+                             item[2],in_attachment=in_attachment)
             phone_entitys.append(_entity)
             phone_entitys.append(_entity)
             last_phone_mask = True
             last_phone_mask = True
 
 
@@ -1447,6 +1452,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 break
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
                 # print(3,combo[0].entity_text,combo[1].entity_text)
+    # 多个招标人/代理人或者别称
+    for idx in range(1,len(pre_entity)):
+        _pre_entity = pre_entity[idx]
+        if _pre_entity in linked_company and _pre_entity.label==5:
+            last_ent = pre_entity[idx-1]
+            if last_ent.entity_type in ['company','org'] and last_ent.label in [0,1]:
+                if last_ent.sentence_index==_pre_entity.sentence_index:
+                    mid_text = list_sentence[_pre_entity.sentence_index].sentence_text[last_ent.wordOffset_end:_pre_entity.wordOffset_begin]
+                    if len(mid_text)<=20 and "," not in mid_text and re.search("[、\((]",mid_text):
+                        _pre_entity.label = last_ent.label
+                        _pre_entity.values[last_ent.label] = 0.6
     # 2022/01/25 固定电话可连多个联系人
     # 2022/01/25 固定电话可连多个联系人
     temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
     temp_person_entitys = [entity for entity in pre_entity if entity.entity_type == 'person']
     temp_person_entitys2 = [] #和固定电话相连的联系人
     temp_person_entitys2 = [] #和固定电话相连的联系人
@@ -1483,7 +1499,25 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                     last_person = before_entity
                     last_person = before_entity
                 else:
                 else:
                     break
                     break
-
+    # 更新person为招标/代理联系人的联系方式
+    for k in PackDict.keys():
+        for i in range(len(PackDict[k]["roleList"])):
+            if PackDict[k]["roleList"][i].role_name == "tenderee":
+                for _person in person_list:
+                    if _person.label==1:#招标联系人
+                        person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
+                        for _p in person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
+                        if not person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
+            if PackDict[k]["roleList"][i].role_name == "agency":
+                for _person in person_list:
+                    if _person.label==2:#代理联系人
+                        person_phone = [phone for phone in _person.person_phone] if _person.person_phone else []
+                        for _p in person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text, _p.entity_text))
+                        if not person_phone:
+                            PackDict[k]["roleList"][i].linklist.append((_person.entity_text,""))
     # 更新 PackDict
     # 更新 PackDict
     not_sure_linked = []
     not_sure_linked = []
     for link_p in list(linked_company):
     for link_p in list(linked_company):
@@ -1736,10 +1770,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         else:
                         else:
                             next_entity = split_entitys[index + 1]
                             next_entity = split_entitys[index + 1]
                             if next_entity.entity_type in ["org","company"]:
                             if next_entity.entity_type in ["org","company"]:
-                                _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 10):next_entity.wordOffset_begin]
-                                _entity_left = re.sub(",()\(\)::", "", _entity_left)
-                                _entity_left = _entity_left[-5:]
-                                if re.search("地址|地点", _entity_left):
+                                _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 20):next_entity.wordOffset_begin]
+                                _entity_left2 = re.sub(",()\(\)::", "", _entity_left)
+                                _entity_left2 = _entity_left2[-5:]
+                                if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
                                     if index + 2<= len(split_entitys) - 1:
                                     if index + 2<= len(split_entitys) - 1:
                                         next_entity = split_entitys[index + 2]
                                         next_entity = split_entitys[index + 2]
                             if entity.sentence_index == next_entity.sentence_index:
                             if entity.sentence_index == next_entity.sentence_index:
@@ -2336,73 +2370,89 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                 if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
                 if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     # print('通过其他中标人投标金额校正中标金额')
                     # print('通过其他中标人投标金额校正中标金额')
-
     for item in list_pop:
     for item in list_pop:
         PackDict.pop(item)
         PackDict.pop(item)
-    # 公告中只有"招标人"且无"联系人"链接时,直接取文中倒数第一个联系人
+    # 公告中只有"招标人"且无"联系人"链接时
     if len(PackDict)==1:
     if len(PackDict)==1:
         k = list(PackDict.keys())[0]
         k = list(PackDict.keys())[0]
         if len(PackDict[k]["roleList"])==1:
         if len(PackDict[k]["roleList"])==1:
             if PackDict[k]["roleList"][0].role_name == "tenderee":
             if PackDict[k]["roleList"][0].role_name == "tenderee":
                 if not PackDict[k]["roleList"][0].linklist:
                 if not PackDict[k]["roleList"][0].linklist:
-                    for _entity in temporary_list2[::-1]:
-                        if _entity.entity_type=='person' and _entity.label==3:
-                            if _entity.person_phone:
-                                _phone = [p.entity_text for p in _entity.person_phone]
-                                for _p in _phone:
-                                    PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
-                                break
-    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时时,如果文中只有一个“phone”实体,则直接取为联系人电话
-    if len(PackDict)==1:
-        k = list(PackDict.keys())[0]
-        if len(PackDict[k]["roleList"])==1:
-            if PackDict[k]["roleList"][0].role_name == "tenderee":
-                if not PackDict[k]["roleList"][0].linklist:
-                    if len(phone_entitys)==1:
-                        PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
-    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时时,通过大纲直接取电话
-    if len(PackDict)==1:
-        k = list(PackDict.keys())[0]
-        if len(PackDict[k]["roleList"])==1:
-            if PackDict[k]["roleList"][0].role_name == "tenderee":
-                if not PackDict[k]["roleList"][0].linklist:
-                    if len(new_split_list)>1:
-                        for _start,_end in new_split_list:
-                            temp_sentence = _content[_start:_end]
-                            sentence_outline = temp_sentence.split(",")[0]
-                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",sentence_outline):
-                                sentence_phone = phone.findall(temp_sentence)
+                    get_contacts = False
+                    if not get_contacts:
+                        # 根据大纲Outline类召回联系人
+                        for outline in list_outline:
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",outline.outline_summary):
+                                for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
+                                    if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
+                                        t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
+                                        if t_person.person_phone:
+                                            _phone = [p.entity_text for p in t_person.person_phone]
+                                            for _p in _phone:
+                                                PackDict[k]["roleList"][0].linklist.append((t_person.entity_text, _p))
+                                            get_contacts = True
+                                            break
+                                    elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
+                                            words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
+                                        break
+                                if not get_contacts:
+                                    sentence_phone = phone.findall(outline.outline_text)
+                                    if sentence_phone:
+                                        PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                        get_contacts = True
+                                        break
+                    if not get_contacts:
+                        # 直接取文中倒数第一个联系人
+                        for _entity in temporary_list2[::-1]:
+                            if _entity.entity_type=='person' and _entity.label==3:
+                                if _entity.person_phone:
+                                    _phone = [p.entity_text for p in _entity.person_phone]
+                                    for _p in _phone:
+                                        PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
+                                    get_contacts = True
+                                    break
+                    if not get_contacts:
+                        # 如果文中只有一个“phone”实体,则直接取为联系人电话
+                        if len(phone_entitys) == 1:
+                            PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
+                            get_contacts = True
+                    if not get_contacts:
+                        # 通过大纲直接取电话
+                        if len(new_split_list) > 1:
+                            for _start, _end in new_split_list:
+                                temp_sentence = _content[_start:_end]
+                                sentence_outline = temp_sentence.split(",::")[0]
+                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人", sentence_outline):
+                                    sentence_phone = phone.findall(temp_sentence)
+                                    if sentence_phone:
+                                        PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                        get_contacts = True
+                                        break
+                    if not get_contacts:
+                        # 通过正则提取句子段落进行提取电话
+                        contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
+                        tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
+                        contact_pattern_list = [tenderee_pattern + contacts_person,
+                                                "(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
+                                                "(?:项目|采购)[^。,]{0,4}" + contacts_person,
+                                                "(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}" + contacts_person, ]
+                        for _pattern in contact_pattern_list:
+                            get_tenderee_contacts = False
+                            for regular_match in re.finditer(_pattern, _content):
+                                match_text = _content[regular_match.end():regular_match.end() + 40]
+                                match_text = match_text.split("。")[0]
+                                sentence_phone = phone.findall(match_text)
                                 if sentence_phone:
                                 if sentence_phone:
                                     PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
                                     PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
+                                    get_tenderee_contacts = True
                                     break
                                     break
-    # 公告中只有"招标人",无"联系人"链接且上一条规则无效时时,通过正则提取句子段落进行提取电话
-    if len(PackDict)==1:
-        k = list(PackDict.keys())[0]
-        if len(PackDict[k]["roleList"])==1:
-            if PackDict[k]["roleList"][0].role_name == "tenderee":
-                if not PackDict[k]["roleList"][0].linklist:
-                    contacts_person = "(?:联系人|联系方|联系方式|负责人|电话|联系电话)[::]?"
-                    tenderee_pattern = "(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主|业主单位)[^。]{0,5}"
-                    contact_pattern_list = [tenderee_pattern + contacts_person,"(?:采购[^。,]{0,2}项目|采购事项|招标)[^。,]{0,4}" + contacts_person,
-                                            "(?:项目|采购)[^。,]{0,4}"+contacts_person,"(?:报名|报价|业务咨询|业务|投标咨询)[^。,]{0,4}"+contacts_person,]
-                    for _pattern in contact_pattern_list:
-                        get_tenderee_contacts = False
-                        for regular_match in re.finditer(_pattern,_content):
-                            match_text = _content[regular_match.end():regular_match.end()+40]
-                            match_text = match_text.split("。")[0]
-                            sentence_phone = phone.findall(match_text)
-                            if sentence_phone:
-                                PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
-                                get_tenderee_contacts = True
+                            if get_tenderee_contacts:
                                 break
                                 break
-                        if get_tenderee_contacts:
-                            break
 
 
     for pack in PackDict.keys():
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
         for i in range(len(PackDict[pack]["roleList"])):
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
             PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()
-
-    return PackDict 
+    return PackDict
 
 
 def initPackageAttr(RoleList,PackageSet):
 def initPackageAttr(RoleList,PackageSet):
     '''
     '''
@@ -2419,7 +2469,7 @@ def initPackageAttr(RoleList,PackageSet):
         packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
         packDict[item.packageName]["roleList"].append(Role(item.role_name,item.entity_text,0,0,0.0,[])) #Role(角色名称,实体名称,角色阈值,金额,金额阈值,连接列表,金额单位)
     return packDict
     return packDict
                 
                 
-def getPackageRoleMoney(list_sentence,list_entity):
+def getPackageRoleMoney(list_sentence,list_entity,list_outline):
     '''
     '''
     @param:
     @param:
         list_sentence:文章的句子list
         list_sentence:文章的句子list
@@ -2435,11 +2485,9 @@ def getPackageRoleMoney(list_sentence,list_entity):
     for item in PackageList:
     for item in PackageList:
         # print(item)
         # print(item)
     '''
     '''
-    # print("=2")
     PackDict = initPackageAttr(RoleList, PackageSet)
     PackDict = initPackageAttr(RoleList, PackageSet)
-    # print("=3")
-    PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_entity, list_sentence)
-    # print("=4")
+
+    PackDict = findAttributeAfterEntity(PackDict, RoleSet, PackageList, PackageSet, list_sentence, list_entity, list_outline)
     return PackDict
     return PackDict
 
 
 def turnBidWay(bidway):
 def turnBidWay(bidway):
@@ -2717,7 +2765,7 @@ def getOtherAttributes(list_entity):
 def getMoneyRange(RoleList):
 def getMoneyRange(RoleList):
     pass
     pass
 
 
-def getPREMs(list_sentences,list_entitys,list_articles):
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
     '''
     '''
     @param:
     @param:
         list_sentence:所有文章的句子list
         list_sentence:所有文章的句子list
@@ -2725,8 +2773,8 @@ def getPREMs(list_sentences,list_entitys,list_articles):
     @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话  
     @return:list of dict which include文章的包-角色-实体名称-金额-联系人-联系电话  
     '''
     '''
     result = []
     result = []
-    for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
-        RoleList = getPackageRoleMoney(list_sentence,list_entity)
+    for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
+        RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
         result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
         result.append(dict({"prem":RoleList,"docid":list_article.doc_id},**getOtherAttributes(list_entity),**getTimeAttributes(list_entity,list_sentence),
                            **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
                            **{"fingerprint":list_article.fingerprint,"match_enterprise":list_article.match_enterprise,
                               "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),
                               "match_enterprise_type":list_article.match_enterprise_type,"process_time":getCurrent_date(),

+ 16 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -259,7 +259,22 @@ class Model_relation_extraction():
         return text_data, pre_data
         return text_data, pre_data
 
 
     def predict(self,text_in, words, rate=0.5):
     def predict(self,text_in, words, rate=0.5):
-        # text_words = text_in
+        # 没有需要预测的链接属性,直接return
+        company_relation = 0
+        person_relation = 0
+        if '<company/org>' in words:
+            company_relation += 1
+        if '<contact_person>' in words:
+            person_relation += 1
+            if company_relation:
+                company_relation += 1
+        if '<location>' in words and company_relation:
+            company_relation += 1
+        if '<phone>' in words and company_relation:
+            person_relation += 1
+        if company_relation < 2 and person_relation < 2:
+            return []
+        # 使用模型预测
         triple_list = []
         triple_list = []
         # print("tokens:",words)
         # print("tokens:",words)
         # _t2 = [self.words2id.get(c, 1) for c in words]
         # _t2 = [self.words2id.get(c, 1) for c in words]

+ 5 - 5
BiddingKG/dl/interface/predictor.py

@@ -334,7 +334,7 @@ class CodeNamePredict():
                         end = iter.span()[1]+get_len
                         end = iter.span()[1]+get_len
                         code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                         code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                         code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
                         code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
-                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
+                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         temp_entitys.append(_entity)
                         temp_entitys.append(_entity)
                     #print("code",code_text)
                     #print("code",code_text)
                     if len(code_x)>0:
                     if len(code_x)>0:
@@ -381,7 +381,7 @@ class CodeNamePredict():
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
 
 
                         #add name to entitys
                         #add name to entitys
-                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
+                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         list_entity.append(_entity)
                         list_entity.append(_entity)
                         w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                         w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                         if _name not in dict_name_freq_score:
                         if _name not in dict_name_freq_score:
@@ -419,7 +419,7 @@ class CodeNamePredict():
                         _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                         _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                         sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                         sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                                          entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                                          entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
-                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end)
+                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
                         list_entity.append(_entity)
                         list_entity.append(_entity)
                         w = 1
                         w = 1
                         if _name not in dict_name_freq_score:
                         if _name not in dict_name_freq_score:
@@ -990,7 +990,7 @@ class EPCPredict():
                         continue
                         continue
                     if re.search("[.,]\d{2,}",phone_right):
                     if re.search("[.,]\d{2,}",phone_right):
                         continue
                         continue
-                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2])
+                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment)
                     phone_entitys.append(_entity)
                     phone_entitys.append(_entity)
             person_entitys = []
             person_entitys = []
             for entity in list_entity:
             for entity in list_entity:
@@ -1644,7 +1644,7 @@ class ProductPredictor():
                                                  entity_text=sentence.sentence_text[start:end],
                                                  entity_text=sentence.sentence_text[start:end],
                                                  entity_type="product", sentence_index=sentence.sentence_index,
                                                  entity_type="product", sentence_index=sentence.sentence_index,
                                                  begin_index=0, end_index=0, wordOffset_begin=start,
                                                  begin_index=0, end_index=0, wordOffset_begin=start,
-                                                 wordOffset_end=end)
+                                                 wordOffset_end=end,in_attachment=sentence.in_attachment)
                                 list_entity.append(_entity)
                                 list_entity.append(_entity)
                                 temp_list.append(sentence.sentence_text[start:end])
                                 temp_list.append(sentence.sentence_text[start:end])
                         # item["product"] = list(set(temp_list))
                         # item["product"] = list(set(temp_list))

+ 3 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -1,6 +1,8 @@
 import re
 import re
 
 
-ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
+# ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
+ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
+# ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)')
 
 
 # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
 # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
 # 基准利率上浮率:百分之三十(30%)
 # 基准利率上浮率:百分之三十(30%)

+ 19 - 16
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -69,19 +69,21 @@ class MyEncoder(json.JSONEncoder):
 
 
 
 
 def predict(doc_id,text):
 def predict(doc_id,text):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
+    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
     for articles in list_articles:
     for articles in list_articles:
         print('预处理后文本信息')
         print('预处理后文本信息')
         print(articles.content)
         print(articles.content)
-    for sentences in list_sentences:
-        for sentence in sentences:
-            print(sentence.tokens)
+    # for sentences in list_sentences:
+    #     for sentence in sentences:
+    #         # print(sentence.sentence_index,sentence.tokens)
+    #         print(sentence.sentence_index,sentence.in_attachment,sentence.tokens)
+    print("location:",[ent.entity_text for ent in list_entitys[0] if ent.entity_type=='location'])
 
 
 
 
     ''''''
     ''''''
         
         
     codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
     codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
-    # print(codeName)
+    print('codeName',codeName)
     premPredict.predict(list_sentences,list_entitys)
     premPredict.predict(list_sentences,list_entitys)
     # for entitys in list_entitys:
     # for entitys in list_entitys:
     #     for entity in entitys:
     #     for entity in entitys:
@@ -98,7 +100,7 @@ def predict(doc_id,text):
     # print("entityLink")
     # print("entityLink")
     entityLink.link_entitys(list_entitys)
     entityLink.link_entitys(list_entitys)
     # print("getPREMs")
     # print("getPREMs")
-    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
     # print("getPREMs")
     # print("getPREMs")
     print("公司——联系人:", end=' ')
     print("公司——联系人:", end=' ')
     print(prem[0])
     print(prem[0])
@@ -129,12 +131,12 @@ def predict(doc_id,text):
                 # print(entity.begin_index, entity.end_index)
                 # print(entity.begin_index, entity.end_index)
                 print(entity.sentence_index)
                 print(entity.sentence_index)
                 pass
                 pass
-            elif entity.entity_type=="time":
-                print("time:",end=" ")
-                print(entity.entity_text, entity.label, entity.values)
-            elif entity.entity_type=="email":
-                print("email:",end=" ")
-                print(entity.entity_text, entity.begin_index, entity.end_index)
+            # elif entity.entity_type=="time":
+            #     print("time:",end=" ")
+            #     print(entity.entity_text, entity.label, entity.values)
+            # elif entity.entity_type=="email":
+            #     print("email:",end=" ")
+            #     print(entity.entity_text, entity.begin_index, entity.end_index)
             elif entity.entity_type in ['org','company']:
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 _sentence = list_sentences[0][entity.sentence_index]
                 print(entity.entity_type)
                 print(entity.entity_type)
@@ -174,7 +176,8 @@ def predict(doc_id,text):
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
 
 
     #print(prem)
     #print(prem)
-    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=2,ensure_ascii=False)
 
 
          
          
 # def test(name,content):
 # def test(name,content):
@@ -318,7 +321,7 @@ def predict_fromdb(docid, dbname="sys_document_23"):
     # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层,  1.采购人信息名 称:北京市植物园。'
     # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层,  1.采购人信息名 称:北京市植物园。'
     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
-    # print(codeName)
+    print('codeName',codeName)
     premPredict.predict(list_sentences, list_entitys)
     premPredict.predict(list_sentences, list_entitys)
     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
     # print("epcPredict")
     # print("epcPredict")
@@ -447,7 +450,6 @@ if __name__=="__main__":
     # 传真:0769-81216222,邮编:523000。(三),采购人:东莞市道滘镇教育管理中心,地址:广东省东莞市道滘镇花园街1号,联系人:李先生,联系电话:0769-81332303,传真:/,邮编:523000,各有关当事人对中标、成交结果有异议的,可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑,逾期将依法不予受理,'''
     # 传真:0769-81216222,邮编:523000。(三),采购人:东莞市道滘镇教育管理中心,地址:广东省东莞市道滘镇花园街1号,联系人:李先生,联系电话:0769-81332303,传真:/,邮编:523000,各有关当事人对中标、成交结果有异议的,可以在中标、成交公告发布之日起7个工作日内以书面形式向(政府采购代理机构)(或采购人)提出质疑,逾期将依法不予受理,'''
     text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
     text = codecs.open("C:\\Users\\Administrator\\Desktop\\test12354.txt", "r", encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div", id="pcontent"))
     content = str(BeautifulSoup(text).find("div", id="pcontent"))
-    from BiddingKG.dl.interface.Preprocessing import tableToText
     # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
     # print("tableToText:",tableToText(BeautifulSoup(re.sub("<html>|</html>|<body>|</body>","",content),"lxml")))
 #     text = '''
 #     text = '''
 # 采购代理机构:山东立行建设项目管理有限公司地址:山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号,联系方式:17862288900,
 # 采购代理机构:山东立行建设项目管理有限公司地址:山东省临沂市兰山县(区)柳青广州路与蒙河路交汇大官苑社区西沿街A区三楼南侧号,联系方式:17862288900,
@@ -459,7 +461,8 @@ if __name__=="__main__":
     a = time.time()
     a = time.time()
     print("start")
     print("start")
     # print(predict("12",content))
     # print(predict("12",content))
-    result = predict("12",text)
+    print(predict("12",text))
+    # result = predict("12",text)
     # result = predict("12",content)
     # result = predict("12",content)
     # print(json.loads(result))
     # print(json.loads(result))
     #test("12",text)
     #test("12",text)

+ 10 - 4
BiddingKG/dl/time/re_servicetime.py

@@ -56,7 +56,7 @@ def re_serviceTime(text):
                          u'|工期、)'
                          u'|工期、)'
                          u'|工期情况|划工期内|服务期内')
                          u'|工期情况|划工期内|服务期内')
 
 
-    reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,|服务期限应按'
+    reg_not1 = re.compile(u'(履行日期:见|服务期限应按|签订合同前,'
                           u'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
                           u'|务期限:1、|同签订日期:|证金在合同签|服务期限截止'
                           u')')
                           u')')
 
 
@@ -68,9 +68,15 @@ def re_serviceTime(text):
         input_str = text_list[index]
         input_str = text_list[index]
 
 
         # 替换混淆词
         # 替换混淆词
-        input_str = re.sub(reg_not, "####", input_str)
-        input_str = re.sub(reg_not1, "######", input_str)
-        input_str = re.sub(reg_not2, "##", input_str)
+        # input_str = re.sub(reg_not, "####", input_str)
+        # input_str = re.sub(reg_not1, "######", input_str)
+        # input_str = re.sub(reg_not2, "##", input_str)
+        for _reg_not in [reg_not,reg_not1,reg_not2]:
+            match = re.findall(_reg_not, input_str)
+            if match:
+                for word in match:
+                    instead = "#" * len(word)
+                    input_str = re.sub(word, instead, input_str)
 
 
         output_str, text_index = re_findAllResult(reg2, input_str)
         output_str, text_index = re_findAllResult(reg2, input_str)
         if len(text_index) == 0:
         if len(text_index) == 0: