Pārlūkot izejas kodu

优化角色提取,修复分词错误造成角色前后文输入错误;

lsm 1 gadu atpakaļ
vecāks
revīzija
d2322317e0

+ 20 - 0
BiddingKG/dl/common/Utils.py

@@ -572,6 +572,26 @@ def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag
     #print(result)
     return result
 
+def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
+    '''
+    Return the text surrounding an entity as a list of string slices.
+
+    :param sentence_text: text of the sentence containing the entity
+    :param begin_index: character offset where the entity starts
+    :param end_index: character offset where the entity ends
+    :param size: number of characters of context taken on each side
+    :param center_include: when true, the slice sentence_text[begin_index:end_index]
+        is inserted between the left and right context
+    :return: [left, right], or [left, center, right] when center_include is true
+    '''
+    result = []
+    # Clamp the left edge at 0 so the slice start never becomes negative.
+    begin = begin_index - size if begin_index>size else 0
+    # The right edge is not clamped; a slice end beyond the text simply stops at the end.
+    end = end_index + size
+    # Left context: up to `size` characters immediately before the entity.
+    result.append(sentence_text[begin: begin_index])
+    if center_include:
+        result.append(sentence_text[begin_index: end_index])
+    # Right context: up to `size` characters immediately after the entity.
+    result.append(sentence_text[end_index: end])
+    return result
+
+
 #根据规则补全编号或名称两边的符号
 def fitDataByRule(data):
     symbol_dict = {"(":")",

+ 2 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -2211,11 +2211,11 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
-        ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
+        ser = re.search('(采购|招标|比选)人(名称)?/(采购|招标|比选)?代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
 
-        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25})[,。]', article_processed)
+        ser2 = re.search('(采购|招标)人(名称)?/(采购|招标)?代理机构(名称)?:(?P<tenderee>[\w()]{4,25})[,。/]', article_processed)
         if ser2:
             article_processed = article_processed.replace(ser2.group(0), '采购人名称:%s,采购代理机构名称:,' % (
             ser2.group('tenderee')))

+ 18 - 1
BiddingKG/dl/interface/extract.py

@@ -196,6 +196,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get prem done of doc_id%s"%(doc_id))
     cost_time["prem"] = round(time.time()-start_time,2)
 
+    # roles_l = get_role_context(doc_id, list_sentences, list_entitys)
+    # return roles_l
+
     # start_time = time.time() # 产品名称及废标原因提取  此处作废 换到后面预测 2022/4/29
     # fail = channel_dic['docchannel']['docchannel'] == "废标公告"
     # fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
@@ -327,7 +330,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-07-18'}
+    version_date = {'version_date': '2023-09-04'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''
@@ -380,6 +383,20 @@ def get_ent_context(list_sentences, list_entitys):
                 rs_list.append("%s %d %.4f; %s ## %s ## %s"%(_entity.entity_type, _entity.label, _entity.values[_entity.label], s[max(0, b-10):b], _entity.entity_text, s[e:e+10]))
     return '\n'.join(rs_list)
 
+def get_role_context(docid, list_sentences, list_entitys):
+    '''
+    Collect the left/right text context of every 'org' / 'company' entity.
+
+    :param docid: value copied into the first field of every returned tuple
+    :param list_sentences: sequence whose first element is an iterable of sentence
+        objects exposing sentence_index and sentence_text; only list_sentences[0] is read
+    :param list_entitys: iterable of entity lists; each entity exposes entity_type,
+        sentence_index, wordOffset_begin, wordOffset_end, label, values and entity_text
+    :return: list of tuples (docid, entity_type, label, values[label] formatted with
+        '%.4f', left context, entity_text, right context)
+    '''
+    rs_list = []
+    # Sorted so that the list position can serve as the lookup key below.
+    # NOTE(review): entities from every list in list_entitys are resolved against
+    # list_sentences[0] -- presumably a single article is passed in; confirm with callers.
+    sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
+    for list_entity in list_entitys:
+        for _entity in list_entity:
+            if _entity.entity_type in ['org', 'company']:
+                # NOTE(review): indexing by sentence_index assumes the indices are
+                # contiguous and start at 0 -- confirm.
+                sentence = sentences[_entity.sentence_index]
+                # Token-based window call, left commented out:
+                # _span = spanWindow(tokens=sentence.tokens, begin_index=_entity.begin_index, end_index=_entity.end_index, size=20,
+                #                    center_include=False, word_flag=True, text=_entity.entity_text)
+                # The context is sliced from the raw sentence text using the entity's
+                # wordOffset_begin / wordOffset_end; with center_include=False the result
+                # is [left, right].
+                _span = get_context(sentence.sentence_text, _entity.wordOffset_begin, _entity.wordOffset_end, size=20, center_include=False)
+                rs_list.append((docid, _entity.entity_type, _entity.label, '%.4f'%_entity.values[_entity.label], _span[0],
+                _entity.entity_text, _span[1]))
+    return rs_list
+
 if __name__=="__main__":
     import pandas as pd
     t1 = time.time()

+ 16 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -80,6 +80,22 @@ class Model_role_classify_word():
         _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
+
+    def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
+        '''
+        Encode the context around an entity, located by character offsets.
+        :param sentence_text: text of the sentence containing the entity
+        :param begin_index: character offset where the entity starts
+        :param end_index: character offset where the entity ends
+        :param size: number of characters of context taken on each side
+        :param kwargs: accepted but not read by this method
+        :return: the value returned by encodeInput for the context slices
+        '''
+        # Context slices around the entity; the entity text itself is excluded (center_include=False).
+        _span = get_context(sentence_text, begin_index, end_index,size=size, center_include=False)  # size=12 center_include=True
+        # print(_span)
+        # word_len is fixed at 20 here regardless of `size`.
+        # NOTE(review): presumably this matches the model's expected input length -- confirm.
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True, userFool=False)  # word_len=20
+        # print(_encode_span)
+        return _encode_span
     
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2))

Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 112 - 85
BiddingKG/dl/interface/predictor.py


Daži faili netika attēloti, jo izmaiņu fails ir pārāk liels