
Add tenderee recall rules (招标人召回规则添加)

znj, 3 years ago
parent
commit ef1700315f

+ 17 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -122,6 +122,23 @@ def link_entitys(list_entitys,on_value=0.81):
                             used_linked_entitys.append(_ent)
                             # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
 
+def doctitle_refine(doctitle):
+    _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
+                             r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
+    return _doctitle_refine
+# 前100个公司实体
+def get_nlp_enterprise(list_entity):
+    count = 0
+    nlp_enterprise = []
+    list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
+    for entity in list_entity:
+        if entity.entity_type in ['org','company']:
+            if entity.entity_text not in nlp_enterprise:
+                nlp_enterprise.append(entity.entity_text)
+                count += 1
+                if count>=100:
+                    break
+    return nlp_enterprise
 
 def getEnterprisePath():
     filename = "LEGAL_ENTERPRISE.txt"

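The two helpers added to entityLink.py are plain functions over the document title and the entity list: `doctitle_refine` strips procurement boilerplate words from the title, and `get_nlp_enterprise` collects up to the first 100 distinct org/company names in reading order. A minimal sketch of both; the namedtuple entity is a hypothetical stand-in for the project's Entity class, and the sample data is made up:

```python
import re
from collections import namedtuple

# Title cleanup: the same substitution doctitle_refine() applies.
title = "测试小学教学设备采购项目结果公告"
refined = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                 r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', title)
print(refined)  # 测试小学教学设备项目

# Hypothetical stand-in for the project's Entity objects; only the fields
# get_nlp_enterprise() actually reads are modeled here.
Ent = namedtuple("Ent", "entity_type entity_text sentence_index begin_index")
entities = [
    Ent("company", "测试建设集团有限公司", 1, 2),
    Ent("org", "测试市财政局", 0, 10),
    Ent("company", "测试建设集团有限公司", 0, 3),   # duplicate text, kept once
]

# Mirror of get_nlp_enterprise(): reading order, de-duplicated, capped at 100.
nlp_enterprise = []
for e in sorted(entities, key=lambda x: (x.sentence_index, x.begin_index)):
    if e.entity_type in ("org", "company") and e.entity_text not in nlp_enterprise:
        nlp_enterprise.append(e.entity_text)
        if len(nlp_enterprise) >= 100:
            break
print(nlp_enterprise)  # ['测试建设集团有限公司', '测试市财政局']
```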
+ 173 - 22
BiddingKG/dl/interface/Preprocessing.py

@@ -1074,7 +1074,7 @@ def get_preprocessed_outline(soup):
             deal_part = body_child[0]
     if len(deal_part.find_all(recursive=False))>2:
         deal_part = deal_part.parent
-    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table','<thead>','<tfoot>']
+    skip_tag = ['turntable', 'tbody', 'th', 'tr', 'td', 'table','thead','tfoot']
     for part in deal_part.find_all(recursive=False):
         # 查找解析文本的主干部分
         is_main_text = False
@@ -1172,7 +1172,7 @@ def segment(soup,final=True):
             # text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
-    commaList = ["div","br","td","p"]
+    commaList = ["div","br","td","p","li"]
     #commaList = []
     spaceList = ["span"]
 
@@ -1181,15 +1181,15 @@ def segment(soup,final=True):
         tbodies = soup.find_all('table')
     # 递归遍历所有节点,插入符号
     for child in soup.find_all(recursive=True):
-
+        # print(child.name,child.get_text())
         if child.name in segList:
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
-        if child.name == 'div' and 'class' in child.attrs:
-            # 添加附件"attachment"标识
-            if "richTextFetch" in child['class']:
-                child.insert_before("##attachment##")
+        # if child.name == 'div' and 'class' in child.attrs:
+        #     # 添加附件"attachment"标识
+        #     if "richTextFetch" in child['class']:
+        #         child.insert_before("##attachment##")
                 # print(child.parent)
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
@@ -1197,7 +1197,6 @@ def segment(soup,final=True):
         # if child.name in spaceList:
         #     child.insert_after(" ")
     text = str(soup.get_text())
-
     #替换英文冒号为中文冒号
     text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
     #替换为中文逗号
@@ -1249,7 +1248,10 @@ def segment(soup,final=True):
         if len(punc_del)>1:
             if len(punc_del.strip())>0:
                 if ":" in punc_del.strip():
-                    text = re.sub(punc_del,":",text)
+                    if "。" in punc_del.strip():
+                        text = re.sub(punc_del, ":。", text)
+                    else:
+                        text = re.sub(punc_del,":",text)
                 else:
                     text = re.sub(punc_del,punc_del.strip()[0],text)   #2021/12/09 修正由于某些标签后插入符号把原来符号替换
             else:
@@ -1698,6 +1700,79 @@ def special_treatment(sourceContent, web_source_no):
             sourceContent = sourceContent.replace('支付金额:', '合同金额:')
     return sourceContent
 
+def article_limit(soup,limit_words=30000):
+    sub_space = re.compile("\s+")
+    def soup_limit(_soup,_count,max_count=30000,max_gap=500):
+        """
+        :param _soup: soup
+        :param _count: 当前字数
+        :param max_count: 字数最大限制
+        :param max_gap: 超过限制后的最大误差
+        :return:
+        """
+        _gap = _count - max_count
+        _is_skip = False
+        next_soup = None
+        while len(_soup.find_all(recursive=False)) == 1 and \
+                _soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
+            _soup = _soup.find_all(recursive=False)[0]
+        try:
+            for _soup_part in _soup.find_all(recursive=False):
+                if not _is_skip:
+                    _count += len(re.sub(sub_space, "", _soup_part.get_text()))
+                    if _count >= max_count:
+                        _gap = _count - max_count
+                        if _gap <= max_gap:
+                            _is_skip = True
+                        else:
+                            next_soup = _soup_part
+                            _count -= len(re.sub(sub_space, "", _soup_part.get_text()))
+                            break
+                else:
+                    _soup_part.decompose()
+        except:
+            return _count,_gap,None
+        return _count,_gap,next_soup
+
+    text_count = 0
+    have_attachment = False
+    attachment_part = None
+    for child in soup.find_all(recursive=True):
+        if child.name == 'div' and 'class' in child.attrs:
+            if "richTextFetch" in child['class']:
+                child.insert_before("##attachment##")
+                attachment_part = child
+                have_attachment = True
+                break
+    if not have_attachment:
+        # 无附件
+        if len(re.sub(sub_space, "", soup.get_text())) > limit_words:
+            text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
+            while n_soup:
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+    else:
+        # 有附件
+        _text = re.sub(sub_space, "", soup.get_text())
+        _text_split = _text.split("##attachment##")
+        if len(_text_split[0])>limit_words:
+            main_soup = attachment_part.parent
+            main_text = main_soup.find_all(recursive=False)[0]
+            text_count, gap, n_soup = soup_limit(main_text, text_count, max_count=limit_words, max_gap=500)
+            while n_soup:
+                text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
+        if len(_text_split[1])>limit_words:
+            attachment_text_nums = 0
+            attachment_skip = False
+            for part in attachment_part.find_all(recursive=False):
+                if not attachment_skip:
+                    attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
+                    if attachment_text_nums>=limit_words:
+                        attachment_skip = True
+                else:
+                    part.decompose()
+
+    return soup
+
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
     :param articles: 待处理的article source html
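The new `article_limit` helper marks the `richTextFetch` div with `##attachment##`, then trims the main body and the attachment separately to roughly `limit_words` characters, descending into whichever child crosses the limit (within a `max_gap` margin). A much-reduced sketch of the trimming idea, assuming BeautifulSoup as used elsewhere in this file; it only handles one flat level of children:

```python
from bs4 import BeautifulSoup
import re

def simple_limit(soup, limit_words=30):
    """Drop trailing top-level blocks once the accumulated text length
    exceeds limit_words (a much-reduced view of article_limit/soup_limit)."""
    sub_space = re.compile(r"\s+")
    count, over = 0, False
    for part in soup.find_all(recursive=False):
        if over:
            part.decompose()          # everything after the limit is removed
            continue
        count += len(sub_space.sub("", part.get_text()))
        if count >= limit_words:
            over = True               # keep the block that crossed the limit
    return soup

html = "<div><p>" + "甲" * 20 + "</p><p>" + "乙" * 20 + "</p><p>" + "丙" * 20 + "</p></div>"
soup = BeautifulSoup(html, "html.parser")
print(simple_limit(soup.div, limit_words=30).get_text())  # the third <p> is dropped
```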
@@ -1712,14 +1787,13 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
         sourceContent = sourceContent.replace('<br/>', '<br>')
         sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
-        for br_match in re.findall("[^>]+?<br>",sourceContent):
-            _new = re.sub("<br>","",br_match)
-            # <br>标签替换为<p>标签
-            if not re.search("^\s+$",_new):
-                _new = '<p>'+_new + '</p>'
-                # print(br_match,_new)
-                sourceContent = sourceContent.replace(br_match,_new,1)
-
+        # for br_match in re.findall("[^>]+?<br>",sourceContent):
+        #     _new = re.sub("<br>","",br_match)
+        #     # <br>标签替换为<p>标签
+        #     if not re.search("^\s+$",_new):
+        #         _new = '<p>'+_new + '</p>'
+        #         # print(br_match,_new)
+        #         sourceContent = sourceContent.replace(br_match,_new,1)
         _send_doc_id = article[3]
         _title = article[4]
         page_time = article[5]
@@ -1737,11 +1811,13 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         if web_source_no in ["00753-14","DX008357-11","18021-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
         for _soup in article_processed.descendants:
-            # 识别无标签文本,添加<p>标签
+            # 识别无标签文本,添加<span>标签
             if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
                 # print(_soup.parent.string,_soup.string.strip())
-                _soup.wrap(article_processed.new_tag("p"))
+                _soup.wrap(article_processed.new_tag("span"))
         # print(article_processed)
+        # 正文和附件内容限制字数30000
+        article_processed = article_limit(article_processed,limit_words=30000)
         article_processed = get_preprocessed_outline(article_processed)
         article_processed = tableToText(article_processed)
         # print(article_processed)
@@ -1749,6 +1825,18 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
+        # 修复OCR金额中“,”、“。”识别错误
+        article_processed_list = article_processed.split("##attachment##")
+        if len(article_processed_list)>1:
+            attachment_text = article_processed_list[1]
+            for _match in re.finditer("\d。\d{2}",attachment_text):
+                _match_text = _match.group()
+                attachment_text = attachment_text.replace(_match_text,_match_text.replace("。","."),1)
+            for _match in re.finditer("(\d,\d{3})[,,.]",attachment_text):
+                _match_text = _match.group()
+                attachment_text = attachment_text.replace(_match_text,_match_text.replace(",",","),1)
+            article_processed_list[1] = attachment_text
+            article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
         if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
@@ -1970,7 +2058,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             cost_time[key_nerToken] = 0
         cost_time[key_nerToken] += round(time.time()-start_time,2)
 
-
+        company_dict = set()
+        company_index = dict((i,set()) for i in range(len(list_sentence)))
         for sentence_index in range(len(list_sentence)):
             list_sentence_entitys = []
 
@@ -2022,6 +2111,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
                     ner_entitys.append((b, e, 'org', entity))
 
+            for ner_entity in ner_entitys:
+                if ner_entity[2] in ['company','org']:
+                    company_dict.add((ner_entity[2],ner_entity[3]))
+                    company_index[sentence_index].add((ner_entity[0],ner_entity[1]))
             #识别package
 
             #识别实体
@@ -2175,10 +2268,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 filter = v
                             if re.search("filter_unit",k) is not None:
                                 filter_unit = True
-
+                    # print(_match.group())
+                    # print(entity_text,unit,text_beforeMoney,filter,filter_unit)
                     if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
                         if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0]-2):_match.span()[0]]):
-                            entity_text = re.sub('\d+,', '', entity_text)
+                             entity_text = re.sub('\d+,', '', entity_text)
                         else:
                             entity_text = entity_text.replace(',', '.')
                         # print(' 修正OCR识别小数点为逗号')
@@ -2290,6 +2384,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
                             # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
+                            # print(entity_text,unit,notes)
+
                 else:
                     index += 1
 
@@ -2456,6 +2552,61 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys
+        # 补充ner模型未识别全的company/org实体
+        for sentence_index in range(len(list_sentence)):
+            sentence_text = list_sentence[sentence_index].sentence_text
+            tokens = list_sentence[sentence_index].tokens
+            doc_id = list_sentence[sentence_index].doc_id
+            in_attachment = list_sentence[sentence_index].in_attachment
+            list_tokenbegin = []
+            begin = 0
+            for i in range(0, len(tokens)):
+                list_tokenbegin.append(begin)
+                begin += len(str(tokens[i]))
+            list_tokenbegin.append(begin + 1)
+            add_sentence_entitys = []
+            company_dict = sorted(list(company_dict),key=lambda x:len(x[1]),reverse=True)
+            for company_type,company_text in company_dict:
+                begin_index_list = findAllIndex(company_text,sentence_text)
+                for begin_index in begin_index_list:
+                    is_continue = False
+                    for t_begin,t_end in list(company_index[sentence_index]):
+                        if begin_index>=t_begin and begin_index+len(company_text)<=t_end:
+                            is_continue = True
+                            break
+                    if not is_continue:
+                        add_sentence_entitys.append((begin_index,begin_index+len(company_text),company_type,company_text))
+                        company_index[sentence_index].add((begin_index,begin_index+len(company_text)))
+                    else:
+                        continue
+            for ner_entity in add_sentence_entitys:
+                begin_index_temp = ner_entity[0]
+                end_index_temp = ner_entity[1]
+                entity_type = ner_entity[2]
+                entity_text = ner_entity[3]
+
+                if entity_type in ["org","company"] and not isLegalEnterprise(entity_text):
+                    continue
+
+                for j in range(len(list_tokenbegin)):
+                    if list_tokenbegin[j]==begin_index_temp:
+                        begin_index = j
+                        break
+                    elif list_tokenbegin[j]>begin_index_temp:
+                        begin_index = j-1
+                        break
+                begin_index_temp += len(str(entity_text))
+                for j in range(begin_index,len(list_tokenbegin)):
+                    if list_tokenbegin[j]>=begin_index_temp:
+                        end_index = j-1
+                        break
+                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
+
+                #去掉标点符号
+                entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
+                entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
+                list_entitys_temp.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
+        list_entitys_temp.sort(key=lambda x:(x.sentence_index,x.begin_index))
         list_entitys.append(list_entitys_temp)
     return list_entitys
     

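The last hunk in this file re-scans every sentence for org/company strings the NER model already produced elsewhere in the document and adds any occurrence not covered by an existing entity span (longer names are tried first, and the real code also checks `isLegalEnterprise`). A minimal sketch of that matching step; `find_all_index` is a stand-in for the project's `findAllIndex`, and the sample data is made up:

```python
def find_all_index(needle, haystack):
    """All start offsets of needle in haystack (stand-in for findAllIndex)."""
    out, start = [], haystack.find(needle)
    while start != -1:
        out.append(start)
        start = haystack.find(needle, start + 1)
    return out

sentence_text = "采购人某某市第一医院委托某某招标代理有限公司组织采购。"
company_dict = {("org", "某某市第一医院"), ("company", "某某招标代理有限公司")}
already_found = {(3, 10)}   # spans the NER model already produced in this sentence

added = []
# longer names first, so a long match blocks its own substrings
for ctype, ctext in sorted(company_dict, key=lambda x: len(x[1]), reverse=True):
    for begin in find_all_index(ctext, sentence_text):
        end = begin + len(ctext)
        if any(begin >= b and end <= e for b, e in already_found):
            continue                      # already covered by an NER entity
        added.append((begin, end, ctype, ctext))
        already_found.add((begin, end))

print(added)  # [(12, 22, 'company', '某某招标代理有限公司')]
```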
+ 14 - 3
BiddingKG/dl/interface/extract.py

@@ -85,9 +85,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles,list_sentences,list_entitys, codeName)
     cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
 
+    start_time = time.time() #正则招标人召回
+    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
+    cost_time["tendereeRuleRecall"] = round(time.time()-start_time,2)
+
     start_time = time.time() #联系人模型提取
     predictor.getPredictor("epc").predict(list_sentences,list_entitys)
     log("get epc done of doc_id%s"%(doc_id))
@@ -124,6 +128,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     # 依赖句子顺序
     start_time = time.time()  # 实体链接
     entityLink.link_entitys(list_entitys)
+    doctitle_refine = entityLink.doctitle_refine(title)
+    nlp_enterprise = entityLink.get_nlp_enterprise(list_entitys[0])
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles,list_outlines)
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
@@ -143,8 +149,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
         for value in prem[0]['prem'].values():
             for l in value['roleList']:
                 try:
-                    if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
-                        l[2] = total_product_money
+                    # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
+                    #     l[2] = total_product_money
+                    #     log('修改中标金额为所有产品总金额')
+                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money:
+                        l["role_money"]['money'] = total_product_money
                         log('修改中标金额为所有产品总金额')
                 except Exception as e:
                     log('表格产品价格修正中标价格报错:%s'%e)
@@ -158,6 +167,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
+    data_res["doctitle_refine"] = doctitle_refine
+    data_res["nlp_enterprise"] = nlp_enterprise
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 

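extract.py now reads the role list entries by key instead of by position. A sketch of the corrected win-tenderer money fix over a hypothetical `prem` fragment; only the keys touched by the diff are modeled, the real schema has more fields:

```python
total_product_money = 120000.0   # sum of all product amounts from the table

# Hypothetical slice of prem[0]['prem'] with just the fields the fix reads.
prem_values = {
    "Project": {
        "roleList": [
            {"role_name": "tenderee", "role_money": {"money": 0}},
            {"role_name": "win_tenderer", "role_money": {"money": 99000.0}},
        ]
    }
}

for value in prem_values.values():
    for l in value["roleList"]:
        try:
            # Raise the winner's amount to the product total when it is smaller.
            if l["role_name"] == "win_tenderer" and float(l["role_money"]["money"]) < total_product_money:
                l["role_money"]["money"] = total_product_money
        except Exception as e:
            print("price correction failed: %s" % e)

print(prem_values["Project"]["roleList"][1]["role_money"]["money"])  # 120000.0
```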
+ 107 - 10
BiddingKG/dl/interface/getAttributes.py

@@ -437,13 +437,17 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
         return None
     PackageList,PackageSet,dict_PackageCode = pack
 
-
     #拿到所有可能的情况
     dict_role_combination = {}
   # print(PackageList)
     #拿到各个实体的packageName,packageCode
     for entity in list_entity:
         if entity.entity_type in ['org','company']:
+            #限制附件里角色values[label]最大概率prob
+            max_prob = 0.85
+            if str(entity.label)!="5" and entity.in_attachment:
+                if entity.values[entity.label]>max_prob:
+                    entity.values[entity.label] = max_prob
             #过滤掉字数小于3个的实体
             if len(entity.entity_text)<=3:
                 continue
@@ -1022,6 +1026,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
         last_tokens_num = len(sentence.tokens)
     attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
+    # print([i.entity_text for i in list_entity if i.entity_type=='money'])
     for link_attribute in attribute_type:
         temp_entity_list = []
         if link_attribute=="money":
@@ -1040,6 +1045,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 drop_tendererMoney.append(next_entity)
             for _drop in drop_tendererMoney:
                 temp_entity_list.remove(_drop)
+            # print([i.entity_text for i in temp_entity_list])
         elif link_attribute=="serviceTime":
             temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
                                 ent.entity_type=='serviceTime']
@@ -1106,6 +1112,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 _entity.pointer_money = _attribute
                 packagePointer, _ = getPackage(PackageList, _attribute.sentence_index, _attribute.begin_index,
                                                "money-" + str(_attribute.entity_text) + "-" + str(_attribute.label))
+                # print(_entity.entity_text,_attribute.entity_text)
                 if packagePointer is None:
                     packageName_entity = "Project"
                 else:
@@ -1239,7 +1246,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
                        '[2-9]\d{6,7}')
-    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
     email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
                             "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
@@ -1357,11 +1364,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     else:
         # 公告大于maxlen时,分段预测
         start = 0
+        # print("len(pre_data)",len(pre_data))
+        temp_data = []
+        deal_data = 0
         while start<len(pre_data):
             _pre_data = pre_data[start:start+maxlen]
             _text_data = text_data[start:start+maxlen]
-            relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+            if relationExtraction_model.check_data(_pre_data):
+                temp_data.append((_text_data,_pre_data))
+            else:
+                if temp_data:
+                    deal_data += len(temp_data)
+                    if deal_data>3:
+                        break
+                    for _text_data, _pre_data in temp_data:
+                        relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+                    temp_data = []
             start = start + maxlen - 120
+        # print("预测数据:",len(temp_data))
+        # if len(temp_data)<=6:
+        #     for _text_data,_pre_data in temp_data:
+        #         relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
+        # else:
+        #     relation_list = []
         # 去重结果
         relation_list = list(set(relation_list))
     # print(relation_list)
@@ -1377,6 +1402,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
+    linked_connetPerson = set()
+    linked_phone = set()
     for predicate in ["rel_address","rel_phone","rel_person"]:
         _match_list = []
         _match_combo = []
@@ -1444,6 +1471,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 break
                 if is_continue: continue
                 combo[0].person_phone.append(combo[1])
+                linked_connetPerson.add(combo[0])
+                linked_phone.add(combo[1])
                 if combo[0].label in [1,2]:
                     if PackDict.get("Project"):
                         for i in range(len(PackDict["Project"]["roleList"])):
@@ -1452,6 +1481,68 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
+    # "联系人——联系电话" 链接规则补充
+    person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
+    person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(person_phone_EntityList)):
+        entity = person_phone_EntityList[ent_idx]
+        if entity.entity_type=="person":
+            match_nums = 0
+            person_nums = 0  # 经过其他中联系人的数量
+            byNotPerson_match_nums = 0  # 跟在联系人后面的属性
+            phone_nums = 0 # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(person_phone_EntityList), ent_idx + 8)):
+                after_entity = person_phone_EntityList[after_index]
+                if after_entity.entity_type == "phone":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    phone_nums += 1
+                    if distance>100 or phone_nums>=4:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                    else:
+                        if distance < 50:
+                            # value = (-1 / 2 * (distance ** 2)) / 10000
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if not person_nums:
+                                byNotPerson_match_nums += 1
+                            else:
+                                break
+                else:
+                    person_nums += 1
+            # 前向查找属性
+            if ent_idx != 0 and (not match_nums or not byNotPerson_match_nums):
+                previous_entity = person_phone_EntityList[ent_idx - 1]
+                if previous_entity.entity_type == 'phone':
+                    # if previous_entity.sentence_index == entity.sentence_index:
+                    distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
+                            tokens_num_dict[previous_entity.sentence_index] + previous_entity.end_index)
+                    if distance < 40:
+                        # 前向 没有 /10000
+                        value = (-1 / 2 * (distance ** 2))
+                        t_match_list.append(Match(entity, previous_entity, value))
+    # km算法分配求解(person-phone)
+    t_match_list = [mat for mat in t_match_list if mat.main_role not in linked_connetPerson and mat.attribute not in linked_phone]
+    personphone_result = dispatch(t_match_list)
+    personphone_result = sorted(personphone_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in personphone_result:
+        _person = match[0]
+        _phone = match[1]
+        if not _person.person_phone:
+            _person.person_phone = []
+        _person.person_phone.append(_phone)
     # 多个招标人/代理人或者别称
     for idx in range(1,len(pre_entity)):
         _pre_entity = pre_entity[idx]
@@ -1852,7 +1943,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         for k in PackDict.keys():
             for i in range(len(PackDict[k]["roleList"])):
                 if PackDict[k]["roleList"][i].role_name == "tenderee":
-                    if not PackDict[k]["roleList"][i].linklist:
+                    # if not PackDict[k]["roleList"][i].linklist:
                         if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
                             if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
                                 if not phone_:
@@ -1862,7 +1953,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                     PackDict[k]["roleList"][i].linklist.append((person_, p))
                                 is_update = True
                 elif PackDict[k]["roleList"][i].role_name == "agency":
-                    if not PackDict[k]["roleList"][i].linklist:
+                    # if not PackDict[k]["roleList"][i].linklist:
                         if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
                             if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
                                 if not phone_:
@@ -1895,7 +1986,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             for _person in company_entity.pointer_person:
                 linked_person.append(_person)
                 linked_persons_with.append(company_entity)
-
     # 一个公司对应多个联系人的补充
     person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
     person_entitys = person_entitys[::-1]
@@ -2358,7 +2448,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
                     PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                     # print('招标金额校正中标金额')
-
+    # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
+    for pack in PackDict.keys():
+        for i in range(len(PackDict[pack]["roleList"])):
+            if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
+                if float(PackDict[pack]["roleList"][i].money) < 1000 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
+                        float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
+                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) * 10000
     # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
     for pack in PackDict.keys():
         tmp_moneys = []
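The 2022/04/01 block above rescales a winner amount that lost its 万 unit: if it is below 1000 and the tenderee budget is roughly 10,000 times larger (ratio in [9995, 11000)), it is multiplied by 10,000. Arithmetic check with made-up numbers:

```python
tenderee_money = 5000000.0   # 招标金额 (budget)
win_money = 500.0            # 中标金额 read as 500 instead of 500万

if 0 < win_money < 1000:
    ratio = tenderee_money / win_money          # 10000.0 here
    if 9995 <= ratio < 11000:
        win_money = win_money * 10000           # restore the missing 万

print(win_money)  # 5000000.0
```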
@@ -2382,7 +2479,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     if not get_contacts:
                         # 根据大纲Outline类召回联系人
                         for outline in list_outline:
-                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人",outline.outline_summary):
+                            if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系",outline.outline_summary):
                                 for t_person in [p for p in temporary_list2 if p.entity_type=='person' and p.label==3]:
                                     if words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= words_num_dict[outline.sentence_begin_index] + outline.wordOffset_begin and words_num_dict[
                                         t_person.sentence_index] + t_person.wordOffset_end < words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
@@ -2417,12 +2514,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
                             get_contacts = True
                     if not get_contacts:
-                        # 通过大纲直接取电话
+                        # 通过大纲Outline类直接取电话
                         if len(new_split_list) > 1:
                             for _start, _end in new_split_list:
                                 temp_sentence = _content[_start:_end]
                                 sentence_outline = temp_sentence.split(",::")[0]
-                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人", sentence_outline):
+                                if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
                                     sentence_phone = phone.findall(temp_sentence)
                                     if sentence_phone:
                                         PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))

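The new person-phone fallback in getAttributes.py scores each candidate (person, phone) pair with a negative quadratic of their token distance and then runs the project's KM-style `dispatch()` over `Match` objects so each person and phone is used at most once. A sketch of the idea with made-up distances; a greedy one-to-one assignment stands in for `dispatch()`:

```python
# Candidate (person, phone) pairs with a token distance between them (made up).
candidates = [
    ("张三", "0871-1234567", 5),
    ("张三", "13800000000", 60),
    ("李四", "13800000000", 8),
]

# Same scoring shape as the diff: closer pairs get a higher (less negative) value.
scored = [(p, ph, -(d ** 2) / 2 / 10000) for p, ph, d in candidates]

# Greedy one-to-one assignment by descending score; the real code uses the
# KM-style dispatch() instead of this greedy pass.
assigned, used_person, used_phone = [], set(), set()
for p, ph, v in sorted(scored, key=lambda x: x[2], reverse=True):
    if p in used_person or ph in used_phone:
        continue
    assigned.append((p, ph))
    used_person.add(p)
    used_phone.add(ph)

print(assigned)  # [('张三', '0871-1234567'), ('李四', '13800000000')]
```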
+ 12 - 4
BiddingKG/dl/interface/modelFactory.py

@@ -258,7 +258,8 @@ class Model_relation_extraction():
             last_sentence_index = key
         return text_data, pre_data
 
-    def predict(self,text_in, words, rate=0.5):
+    def check_data(self, words):
+        # 检查数据是否包含可预测的subject和object
         # 没有需要预测的链接属性,直接return
         company_relation = 0
         person_relation = 0
@@ -268,12 +269,19 @@ class Model_relation_extraction():
             person_relation += 1
             if company_relation:
                 company_relation += 1
-        if '<location>' in words and company_relation:
-            company_relation += 1
+        # 暂时不考虑地址location实体
+        # if '<location>' in words and company_relation:
+        #     company_relation += 1
         if '<phone>' in words and company_relation:
             person_relation += 1
         if company_relation < 2 and person_relation < 2:
-            return []
+            return False
+        return True
+
+    def predict(self,text_in, words, rate=0.5):
+        # 没有需要预测的链接属性,直接return
+        # if self.check_data(words):
+        #     return []
         # 使用模型预测
         triple_list = []
         # print("tokens:",words)

+ 270 - 70
BiddingKG/dl/interface/predictor.py

@@ -32,6 +32,7 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "epc":{"predictor":None,"Lock":RLock()},
               "roleRule":{"predictor":None,"Lock":RLock()},
               "roleRuleFinal":{"predictor":None,"Lock":RLock()},
+              "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
                   "form":{"predictor":None,"Lock":RLock()},
                   "time":{"predictor":None,"Lock":RLock()},
                   "punish":{"predictor":None,"Lock":RLock()},
@@ -57,6 +58,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type == "roleRuleFinal":
                     dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
+                if _type == "tendereeRuleRecall":
+                    dict_predictor[_type]["predictor"] = TendereeRuleRecall()
                 if _type == "form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
                 if _type == "time":
@@ -332,9 +335,9 @@ class CodeNamePredict():
                         else:
                             begin = iter.span()[0]-get_len
                         end = iter.span()[1]+get_len
-                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
-                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
-                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
+                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
+                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""))
+                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         temp_entitys.append(_entity)
                     #print("code",code_text)
                     if len(code_x)>0:
@@ -1100,11 +1103,11 @@ class RoleRulePredictor():
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
         self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
-                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
-                                "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
-                                     "(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+                                     "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
@@ -1266,7 +1269,7 @@ class RoleRulePredictor():
                                                 if _v_group is not None and _v_group != "":
                                                     _role = _group.split("_")[0]
                                                     if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
-                                                        print('p_entity_sentenceindex:', p_entity.sentence_index)
+                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                         if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
                                                             continue
                                                         if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
@@ -1279,8 +1282,7 @@ class RoleRulePredictor():
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                     if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
-                                                                                                        list_spans[
-                                                                                                            0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
+                                                                                                        list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
@@ -1385,70 +1387,96 @@ class RoleRulePredictor():
 
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
-    def predict(self, list_articles, list_entitys, list_codenames):
-        text_end = list_articles[0].content[-40:]
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
+        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
+        end_tokens = []
+        for sentence in main_sentences[-5:]:
+            end_tokens.extend(sentence.tokens)
+        text_end = "".join(end_tokens[-30:])
+        # print(text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
-        sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
-
-        if sear_ent or sear_ent2 or sear_ent3:
-            if sear_ent3:
-                ent_re = sear_ent3.group(2)
-            elif sear_ent2:
-                ent_re = sear_ent2.group(2)
-            else:
-                ent_re = sear_ent.group(1)
-            ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
-            tenderee_notfound = True
-            agency_notfound = True
-            ents = []
-            for ent in list_entitys[0]:
-                if ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
-                        tenderee_notfound = False
-                    elif ent.label == 1:
-                        agency_notfound = False
-                    elif ent.label == 5:
-                        ents.append(ent)
-            if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
-                                              or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
-                n = 0
-                for i in range(len(ents) - 1, -1, -1):
-                    n += 1
-                    if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
-                        break
-                    if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 0
-                        ents[i].values[0] = 0.5
-                        # log('正则最后补充实体: %s'%(ent_re))
-                        break
-            elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
-                n = 0
-                for i in range(len(ents) - 1, -1, -1):
-                    n += 1
-                    if n > 3 and sear_ent:  # 文章末尾角色加日期这种只找后三个实体
-                        break
-                    if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 1
-                        ents[i].values[1] = 0.5
-                        # log('正则最后补充实体: %s'%(ent_re))
-                        break
-
+        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+        sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
+        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]
+
+        tenderee_notfound = True
+        agency_notfound = True
+        ents = []
+        for ent in list_entitys[0]:
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    tenderee_notfound = False
+                elif ent.label == 1:
+                    agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
+            for _sear_ent in [_sear for _sear in sear_list if _sear]:
+                # if sear_ent4:
+                #     ent_re = sear_ent4.group(2)
+                # elif sear_ent3:
+                #     ent_re = sear_ent3.group(2)
+                # elif sear_ent2:
+                #     ent_re = sear_ent2.group(2)
+                # else:
+                #     ent_re = sear_ent.group(1)
+                if _sear_ent==sear_ent4:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent3:
+                    ent_re = _sear_ent.group(2)
+                elif _sear_ent==sear_ent2:
+                    ent_re = _sear_ent.group(2)
+                else:
+                    ent_re = _sear_ent.group(1)
+                # print('ent_re', ent_re)
+                ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
+
+                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
+                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
+                    n = 0
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 0
+                            ents[i].values[0] = 0.5
+                            tenderee_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
+                    n = 0
+                    for i in range(len(ents) - 1, -1, -1):
+                        if not ents[i].in_attachment:
+                            n += 1
+                        if n > 3 and _sear_ent==sear_ent:  # 文章末尾角色加日期这种只找后三个实体
+                            break
+                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
+                            ents[i].label = 1
+                            ents[i].values[1] = 0.5
+                            agency_notfound = False
+                            # log('正则最后补充实体: %s'%(ent_re))
+                            break
+                if not tenderee_notfound:
+                    break
 
         elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
-            tenderee_notfound = True
-            ents = []
-            for ent in list_entitys[0]:
-                if ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
-                        tenderee_notfound = False
-                    elif ent.label == 1:
-                        agency_notfound = False
-                    elif ent.label == 5:
-                        ents.append(ent)
+            # tenderee_notfound = True
+            # ents = []
+            # for ent in list_entitys[0]:
+            #     if ent.entity_type in ['org', 'company']:
+            #         if ent.label == 0:
+            #             tenderee_notfound = False
+            #         elif ent.label == 1:
+            #             agency_notfound = False
+            #         elif ent.label == 5:
+            #             ents.append(ent)
             if tenderee_notfound == True:
-                print('list_codenames',list_codenames[0]['name'])
+                # print('list_codenames',list_codenames[0]['name'])
                 for ent in ents:
                     if ent.entity_text in list_codenames[0]['name']:
                         ent.label = 0
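RoleRuleFinalAdd now builds `text_end` from the tokens of the last five non-attachment sentences and tries the four patterns in priority order sear_ent4, sear_ent3, sear_ent2, sear_ent. A quick check of the new `sear_ent4` (发布单位) pattern, copied from the hunk, against a made-up document ending:

```python
import re

content = "……特此公告。发布单位:测试市公共资源交易中心,发布时间:2022年4月1日。"   # made up
sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', content[:5000])
if sear_ent4:
    ent_re = sear_ent4.group(2).replace(',', '').replace("(", "(").replace(")", ")")
    print(ent_re)  # 测试市公共资源交易中心
```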
@@ -1456,7 +1484,179 @@ class RoleRuleFinalAdd():
                         # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                         break
 
-
+# 招标人角色召回规则
+class TendereeRuleRecall():
+    def __init__(self):
+        self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
+                                        "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
+
+        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
+                                        "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
+                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
+                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
+                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
+        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
+        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
+        # 公告主语判断规则
+        self.subject = re.compile("[我本][院校局]")
+        # 未识别实体召回正则
+        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
+                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
+                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
+        # 未识别实体尾部判断
+        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓)")
+        self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
+
+    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
+        # tenderee_notfound = True
+        # agency_notfound = True
+        self.get_tenderee = False
+        ents = []
+        list_name = []
+        for ent in list_entitys[0]:
+            if ent.entity_type == 'name':
+                list_name.append(ent.entity_text)
+            if ent.entity_type in ['org', 'company']:
+                if ent.label == 0:
+                    # tenderee_notfound = False
+                    self.get_tenderee = True
+                # elif ent.label == 1:
+                #     agency_notfound = False
+                elif ent.label == 5:
+                    ents.append(ent)
+        if not self.get_tenderee:
+            self.entity_context_rule(ents,list_name,list_sentences)
+        if not self.get_tenderee:
+            self.subject_rule(ents,list_articles,list_sentences)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
+        if not self.get_tenderee:
+            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
+
+    #entity上下文正则判断
+    def entity_context_rule(self,entitys,list_name,list_sentences):
+        for ent in entitys:
+            _sentence = list_sentences[0][ent.sentence_index]
+            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                               end_index=ent.end_index, size=40, center_include=True,
+                               word_flag=True, use_text=True,
+                               text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+            if re.search(self.tenderee_left,_span[0]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right,_span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif re.search(self.tenderee_right2, _span[2]):
+                ent.label = 0
+                ent.values[0] = 0.5 + ent.values[0] / 10
+                self.get_tenderee = True
+            elif list_name:
+                pj_name = re.search(self.tenderee_right3, _span[2])
+                if pj_name:
+                    pj_name = pj_name.groupdict()["project"]
+                    for _name in list_name:
+                        if _name in pj_name:
+                            ent.label = 0
+                            ent.values[0] = 0.5
+                            self.get_tenderee = True
+                            break
+    # 公告主语判断
+    def subject_rule(self, entitys,list_articles,list_sentences):
+        content = list_articles[0].content.split('##attachment##')[0]
+        if re.search(self.subject,content):
+            _subject = re.search(self.subject,content).group()
+            for ent in entitys:
+                if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
+                    ent.label = 0
+                    ent.values[0] = 0.5 + ent.values[0] / 10
+                    self.get_tenderee = True
+                elif re.search("局", _subject) and re.search("局", ent.entity_text):
+                    _sentence = list_sentences[0][ent.sentence_index]
+                    _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
+                                       end_index=ent.end_index, size=20, center_include=True,
+                                       word_flag=True, use_text=True,
+                                       text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
+                    if not re.search("监督|投诉",_span[0][-10:]):
+                        ent.label = 0
+                        ent.values[0] = 0.5 + ent.values[0] / 10
+                        self.get_tenderee = True
+
+    # 正则召回未识别实体
+    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
+        list_sentence = list_sentences[0]
+        for in_attachment in [False,True]:
+            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
+                sentence_text = sentence.sentence_text
+                tokens = sentence.tokens
+                doc_id = sentence.doc_id
+                in_attachment = sentence.in_attachment
+                list_tokenbegin = []
+                begin = 0
+                for i in range(0, len(tokens)):
+                    list_tokenbegin.append(begin)
+                    begin += len(str(tokens[i]))
+                list_tokenbegin.append(begin + 1)
+                for _match in re.finditer(pattern,sentence_text):
+                    _groupdict = _match.groupdict()
+                    _match_text = _match.group()
+                    _unrecognized_text = _groupdict["unrecognized"]
+                    # print(_unrecognized_text)
+                    # if _match_text[-1] in [':',':']:
+                    #     _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    #     if not _unrecognized:
+                    #         _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    #     if _unrecognized:
+                    #         _unrecognized = _unrecognized.group()
+                    #     else:
+                    #         continue
+                    # else:
+                    #     _unrecognized = _unrecognized_text
+                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
+                    if not _unrecognized:
+                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
+                    if _unrecognized:
+                        _unrecognized = _unrecognized.group()
+                    else:
+                        continue
+                    # print(_unrecognized)
+                    if re.search("某",_unrecognized):
+                        continue
+                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
+                    for j in range(len(list_tokenbegin)):
+                        if list_tokenbegin[j] == begin_index_temp:
+                            begin_index = j
+                            break
+                        elif list_tokenbegin[j] > begin_index_temp:
+                            begin_index = j - 1
+                            break
+                    index = begin_index_temp + len(_unrecognized)
+                    end_index_temp = index
+                    for j in range(begin_index, len(list_tokenbegin)):
+                        if list_tokenbegin[j] >= index:
+                            end_index = j - 1
+                            break
+                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
+                    entity_text = _unrecognized
+                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
+                               begin_index_temp, end_index_temp, in_attachment=in_attachment)
+                    new_entity.label = 0
+                    new_entity.values = [on_value,0,0,0,0,0]
+                    list_entitys[0].append(new_entity)
+                    self.get_tenderee = True
+            if self.get_tenderee:
+                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
+                break
 
 # 时间类别
 class TimePredictor():

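The new `TendereeRuleRecall` predictor only acts while no entity has been labeled as tenderee: it first checks the context windows of known org/company entities, then falls back to the `unrecognized1`/`unrecognized2` patterns that pull an unlabeled name straight out of the sentence text and validate its tail with `unrecognized_end1`/`unrecognized_end2` (names containing 某 are skipped). A small demo of that fallback using two patterns copied from the class and a made-up sentence:

```python
import re

# Patterns copied from TendereeRuleRecall.__init__
unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)"
                           "(人|商|公司|单位|组织|用户|业主|主体|方|部门))"
                           "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓)")

sentence_text = "采购人名称:测试县人民医院,联系人:张三。"   # made-up sentence
m = unrecognized1.search(sentence_text)
if m:
    candidate = m.group("unrecognized")           # 测试县人民医院
    tail = unrecognized_end1.search(candidate)    # validate the name ending
    if tail and "某" not in tail.group():         # placeholder names with 某 are skipped
        print("recalled tenderee:", tail.group()) # recalled tenderee: 测试县人民医院
```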
+ 1 - 1
BiddingKG/dl/ratio/re_ratio.py

@@ -1,7 +1,7 @@
 import re
 
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
-ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
+ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
 # ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)')
 
 # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,

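The only change in re_ratio.py makes the `([中投]标|报价|总价)` prefix before 费率 optional, so a bare 费率 expression is now matched as well. Quick check with the updated pattern copied from the diff:

```python
import re

# Updated pattern, copied from the diff (the role prefix before 费率 is now optional).
ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'

print(re.search(ratio, "中标费率:1.5%").group())  # matched both before and after the change
print(re.search(ratio, "费率:1.5%").group())      # only matched now that the prefix is optional
```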
+ 6 - 3
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -92,7 +92,8 @@ def predict(doc_id,text):
     #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
     #             pass
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_sentences,list_entitys, codeName)
+    predictor.getPredictor("tendereeRuleRecall").predict(list_articles,list_sentences,list_entitys, codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
 
@@ -143,11 +144,13 @@ def predict(doc_id,text):
                 if entity.pointer_person:
                     print("公司->联系人1:",end=' ')
                     print(entity.entity_text,[i.entity_text for i in entity.pointer_person],entity.label,entity.values)
+                    # print(_sentence.tokens[entity.begin_index:entity.end_index+3])
                     # print(entity.entity_text,entity.label,entity.values)
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                 else:
                     print("公司->联系人2:", end=' ')
                     print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
+                    print(_sentence.tokens[entity.begin_index:entity.end_index+3])
                     # print(_sentence.sentence_text,_sentence.tokens[entity.begin_index:entity.end_index+1])
                     pass
                 if entity.label in [2,3,4]:
@@ -166,7 +169,7 @@ def predict(doc_id,text):
             #     if entity.pointer_pack:
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type =='money':
-            #     print('money',entity.entity_text,entity.label)
+            #     print('money',entity.entity_text,entity.label,entity.money_unit,entity.notes)
             # elif entity.entity_type =='phone':
             #     print('phone',entity.entity_text)
             # elif entity.entity_type =='name':
@@ -177,7 +180,7 @@ def predict(doc_id,text):
 
     #print(prem)
     # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=2,ensure_ascii=False)
+    return json.dumps(prem[0],cls=MyEncoder,sort_keys=True,indent=1,ensure_ascii=False)
 
          
 # def test(name,content):