2 năm trước cách đây · 8724d4e57b
--- a/BiddingKG/dl/bidway/re_bidway.py
+++ b/BiddingKG/dl/bidway/re_bidway.py
@@ -200,6 +200,7 @@ import re
 
															 #
														
 
															 #     return output_list[0], text_index_list[0]
														
 
															+normal_bidway = "公开招标|邀请招标|竞争性谈判|竞争性磋商|单一来源|框架协议|询价"
														
 
															 bidway = '单一来源' \
														
 
															          '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
														
@@ -210,17 +211,17 @@ bidway = '单一来源' \
 
															          '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
														
 
															          '|库内邀请|库内公开发包|内部邀标' \
														
 
															          '|定点采购议价|定点采购' \
														
 
															-         '|竞争性评审'
														
 
															+         '|竞争性评审|框架协议'
														
 
															 not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
														
 
															              '|限时竞价|咨询单位|询价单'
														
 
															-not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
														
 
															+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除|可以选择|包括|涉及|非"
														
 
															 not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
														
 
															                     "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
														
 
															                     "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
														
 
															-                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
														
 
															+                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败|小组"
														
 
															 bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
														
 
															                  '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
														
@@ -268,21 +269,64 @@ def re_standard_bidway(_str):
 
															     bidway_list = []
														
 
															     if match:
														
 
															         for m in match:
														
 
															-            m_dict = m.groupdict()
														
 
															-            m_span = m.span()
														
 
															-            keyword = ""
														
 
															-            keyword_index = [m_span[0], m_span[1]]
														
 
															-            for key in m_dict.keys():
														
 
															-                if key == "value":
														
 
															-                    keyword = m_dict.get(key)
														
 
															-                else:
														
 
															-                    keyword_index[0] += len(m_dict.get(key))
														
 
															+            keyword = m.group('value')
														
 
															+            keyword_index = list(m.span('value'))
														
 
															+            behind_str = _str[m.start(): m.end()+30]
														
 
															+            if len(re.findall(normal_bidway, behind_str))>1:
														
 
															+                keyword = ''
														
 
															+                for it in re.finditer('(?P<sign>.{1,2})(?P<bidway>'+normal_bidway+')+', behind_str): # 招标方式后面多个选择处理
														
 
															+                    if '□' != it.group('sign')[-1]:
														
 
															+                        keyword = it.group('bidway')
														
 
															+                        keyword_index = [m.start()+it.start('bidway'), m.start()+it.end('bidway')]
														
 
															+                        break
														
 
															+             # m_dict = m.groupdict()
														
 
															+            # m_span = m.span()
														
 
															+            # keyword = ""
														
 
															+            # keyword_index = [m_span[0], m_span[1]]
														
 
															+            # for key in m_dict.keys():
														
 
															+            #     if key == "value":
														
 
															+            #         keyword = m_dict.get(key)
														
 
															+            #     else:
														
 
															+            #         keyword_index[0] += len(m_dict.get(key))
														
 
															             bidway_list.append([keyword, keyword_index])
														
 
															     return bidway_list
														
 
															+def re_normal_bidway(_str):
														
 
															+    ser = re.search("("+normal_bidway+")(转为?|变更为|更改为)"+"(?P<bidway>(" + normal_bidway + "))", _str) # 如果方式变更取变更后的
														
 
															+    if ser:
														
 
															+        return [[ser.group('bidway'), list(ser.span('bidway'))]]
														
 
															+    reg_all = "(?P<value>" + normal_bidway + ")"
														
 
															+    match = re.finditer(reg_all, _str)
														
 
															+    bidway_list = []
														
 
															+    bidway_set = set()
														
 
															+    if match:
														
 
															+        for m in match:
														
 
															+            keyword = m.group()
														
 
															+            if keyword == '公开招标' and m.start()>0 and _str[m.start()-1]=='非':
														
 
															+                continue
														
 
															+            keyword_index = list(m.span())
														
 
															+            bidway_set.add(keyword)
														
 
															+            bidway_list.append([keyword, keyword_index])
														
 
															+    if len(bidway_list) == 0: # 如果找不到标准方式，匹配简称方式
														
 
															+        ser = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
														
 
															+        if ser:
														
 
															+            return [[ser.group('bidway'), list(ser.span('bidway'))]]
														
 
															+    if len(bidway_set) > 1: # 匹配到多种招标方式返回空
														
 
															+        return []
														
 
															+    return bidway_list
														
 
															 def re_all_bidway(_str):
														
 
															+    reg_all = "(?P<value>" + normal_bidway + ")" # 优先匹配规范的招标方式
														
 
															+    match = re.finditer(reg_all, _str)
														
 
															+    bidway_list = []
														
 
															+    if match:
														
 
															+        for m in match:
														
 
															+            keyword = m.group()
														
 
															+            keyword_index = list(m.span())
														
 
															+            bidway_list.append([keyword, keyword_index])
														
 
															+    return bidway_list
														
 
															+
														
 
															     reg_all = "(?P<value>" + bidway + ")"
														
 
															     match = re.finditer(reg_all, _str)
														
 
															     bidway_list = []
														
@@ -339,6 +383,13 @@ def get_one_word(bidway_list):
 
															 def re_bidway(text, title):
														
 
															+    # 优先匹配标题标准招标方式
														
 
															+    if len(title)<100:
														
 
															+        bidway_list = re_normal_bidway(title)
														
 
															+        if bidway_list:
														
 
															+            word, text_index = get_one_word(bidway_list)
														
 
															+            return word, text_index
														
 
															+
														
 
															     # 替换易混淆词
														
 
															     text_clean = re_not_bidway(text)
														
 
															     title_clean = re_not_bidway(title)
														
@@ -406,12 +457,30 @@ bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
 
															                '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
														
 
															                '竞争性磋商': '竞争性磋商', '采购方式：邀请': '邀请招标',
														
 
															                '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
														
 
															-               '网上询价': '询价'}
														
 
															+               '网上询价': '询价', '框架协议': '框架协议', '谈判':'竞争性谈判'}
														
 
															 # bidway名称统一规范
														
 
															 def bidway_integrate(bidway):
														
 
															     integrate_name = bidway_dict.get(bidway,"其他")
														
 
															     return integrate_name
														
 
															+def bidway_normalize(key):
														
 
															+    if re.search('公开招标|公开发包', key):
														
 
															+        return '公开招标'
														
 
															+    elif re.search('单一来源', key):
														
 
															+        return '单一来源'
														
 
															+    elif re.search('磋商', key):
														
 
															+        return '竞争性磋商'
														
 
															+    elif re.search('谈判', key):
														
 
															+        return '竞争性谈判'
														
 
															+    elif re.search('竞谈|竞价|竞投|竞标', key):
														
 
															+        return '竞价'
														
 
															+    elif re.search('询价|询比|比价|询单', key):
														
 
															+        return '询价'
														
 
															+    elif re.search('邀请|邀标', key):
														
 
															+        return '邀请招标'
														
 
															+    else:
														
 
															+        return bidway_dict.get(key, '其他')
														
 
															+
														
 
															 def test_csv():
														
 
															     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
														
@@ -456,13 +525,90 @@ def test_str():
 
															 def test_html():
														
 
															-    html_path = "C:/Users/Administrator/Desktop/3.html"
														
 
															+    # html_path = "C:/Users/Administrator/Desktop/3.html"
														
 
															+    html_path = 'd:/html/2.html'
														
 
															-    with open(html_path, "r") as f:
														
 
															+    with open(html_path, "r", encoding='utf-8') as f:
														
 
															         s = f.read()
														
 
															     print(extract_bidway(s, title=""))
														
 
															+def get_valuate():
														
 
															+    import psycopg2
														
 
															+    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')
														
 
															+    cursor = conn.cursor()
														
 
															+    sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;" # where docid='110635873'
														
 
															+    # sql = "select c1.docid, c1.doctitle from corpus_otherinput c1;"
														
 
															+    # sql = "select text from corpus_iedocument limit 50000;"
														
 
															+    cursor.execute(sql)
														
 
															+    datas = []
														
 
															+    olds = []
														
 
															+    news = []
														
 
															+    label_old = []
														
 
															+    label_new = []
														
 
															+    labels = []
														
 
															+    for row in cursor.fetchall():
														
 
															+        docid = row[0]
														
 
															+        doctitle = row[1]
														
 
															+        ex = row[2]
														
 
															+        text = row[3]
														
 
															+        ser = re.search('"bidway": "(\w{,6})"', ex)
														
 
															+        # print('ser:', ser)
														
 
															+        old = ser.group(1) if ser else ""
														
 
															+        pred = extract_bidway(text, title=doctitle)
														
 
															+
														
 
															+        # list_bidway = extract_bidway(text, title=doctitle)
														
 
															+        # print('list_bidway', list_bidway)
														
 
															+        # if list_bidway:
														
 
															+        #     bidway = list_bidway[0].get("body")
														
 
															+        #     # bidway名称统一规范
														
 
															+        #     bidway = bidway_integrate(bidway)
														
 
															+        # else:
														
 
															+        #     bidway = ""
														
 
															+        # print('bidway: ', bidway)
														
 
															+
														
 
															+        pred = pred[0]['body'] if len(pred) > 0 else ""
														
 
															+        new = bidway_dict.get(pred, "其他") if pred!="" else ""
														
 
															+        sql2 = "select value from brat_bratannotation where document_id='{0}' and value like '%bidway%' limit 4;".format(docid)
														
 
															+        cursor.execute(sql2)
														
 
															+        lb_new = docid + "_"
														
 
															+        lb_old = docid + "_"
														
 
															+        tmp_l = []
														
 
															+        for row in cursor.fetchall():
														
 
															+            lb = row[0].split()[-1]
														
 
															+            lb = bidway_dict.get(lb, "其他")  # 新准确率：0.9642, 召回率： 0.9642, F1: 0.8965
														
 
															+            # lb = bidway_normalize(lb)   # 旧准确率：0.9287, 召回率： 0.9287, F1: 0.8011  新准确率：0.9692, 召回率： 0.9692, F1: 0.9105
														
 
															+
														
 
															+            tmp_l.append(lb)
														
 
															+            if lb == new:
														
 
															+                lb_new = docid + "_" + lb
														
 
															+            if lb == old:
														
 
															+                lb_old = docid + "_" + lb
														
 
															+        olds.append(docid + "_" + old)
														
 
															+        news.append(docid + "_" + new)
														
 
															+        label_new.append(lb_new)
														
 
															+        label_old.append(lb_old)
														
 
															+        labels.append('；'.join(tmp_l))
														
 
															+        datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, '；'.join(tmp_l)))
														
 
															+
														
 
															+    eq_old = len(set(olds)&set(label_old))
														
 
															+    eq_new = len(set(news)&set(label_new))
														
 
															+
														
 
															+    acc_old = eq_old/len(set(olds))
														
 
															+    recall_old = eq_old/len(set(label_old))
														
 
															+    f1_old = acc_old*recall_old/2*(acc_old+recall_old)
														
 
															+
														
 
															+    acc_new = eq_new/len(set(news))
														
 
															+    recall_new = eq_new/len(set(label_new))
														
 
															+    f1_new = acc_new*recall_new/2*(acc_new+recall_new)
														
 
															+    print('旧准确率：%.4f, 召回率： %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
														
 
															+    print('新准确率：%.4f, 召回率： %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))
														
 
															+
														
 
															+
														
 
															+    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
														
 
															+    df['old_pos'] = df.apply(lambda x:1 if x['pred_old']==x['label_old'] else 0, axis=1)
														
 
															+    df['new_pos'] = df.apply(lambda x:1 if x['pred_new']==x['label_new'] else 0, axis=1)
														
 
															+    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)
														
 
															 if __name__ == "__main__":
														
 
															     # extract_bidway(s)
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -277,8 +277,8 @@ class CodeNamePredict():
 
															     def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
														
 
															         #@summary: 获取每篇文章的code和name
														
 
															-        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
														
 
															-
														
 
															+        # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
														
 
															+        pattern_score = re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院')
														
 
															         result = []
														
 
															         index_unk = self.word2index.get("<unk>")
														
 
															         # index_pad = self.word2index.get("<pad>")
														
@@ -393,20 +393,40 @@ class CodeNamePredict():
 
															                                 #add code to entitys
														
 
															                                 list_entity.append(temp_entitys[h])
														
 
															-
														
 
															-                                if the_code not in code_set:
														
 
															+                                if re.search(',|/|;|、|，', the_code) and len(the_code)>25:
														
 
															+                                    for it in re.split(',|/|;|、|，', the_code):
														
 
															+                                        if len(it) > 8:
														
 
															+                                            if it not in code_set:
														
 
															+                                                code_set.add(it)
														
 
															+                                                item['code'].append(it)
														
 
															+                                        elif len(item['code']) > 0:
														
 
															+                                            new_it = item['code'][-1] + re.search(',|/|;|、|，', the_code).group(0) + it
														
 
															+                                            if new_it not in code_set:
														
 
															+                                                code_set.add(new_it)
														
 
															+                                                item['code'][-1] = new_it
														
 
															+                                        else:
														
 
															+                                            if the_code not in code_set:
														
 
															+                                                code_set.add(the_code)
														
 
															+                                                item['code'].append(the_code)
														
 
															+                                            break
														
 
															+                                elif the_code not in code_set:
														
 
															                                     code_set.add(the_code)
														
 
															-                                    item['code'] = list(code_set)
														
 
															+                                    item['code'].append(the_code)
														
 
															+
														
 
															+                                # if the_code not in code_set:
														
 
															+                                #     code_set.add(the_code)
														
 
															+                                #     item['code'] = list(code_set)
														
 
															                     for iter in re.finditer(self.PN_pattern,join_predict):
														
 
															                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
														
 
															                         #add name to entitys
														
 
															                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
														
 
															                         list_entity.append(_entity)
														
 
															-                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
														
 
															+                        # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
														
 
															+                        w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[:：\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
														
 
															                         if _name not in dict_name_freq_score:
														
 
															                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
														
 
															-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
														
 
															+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
														
 
															                         else:
														
 
															                             dict_name_freq_score[_name][0] += 1
														
 
															                     '''
														
@@ -423,18 +443,21 @@ class CodeNamePredict():
 
															                 _begin_index += _LEN
														
 
															             list_name_freq_score = []
														
 
															+            # print('模型预测项目名称：', dict_name_freq_score)
														
 
															             # 2020/11/23 大网站规则调整
														
 
															             if len(dict_name_freq_score) == 0:
														
 
															-                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]+([^，。：；]{2,60})[，。]'
														
 
															+                # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]+([^，。：；]{2,60})[，。]'
														
 
															+                name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[:：\s]+(?P<name>[^，。：；]{2,60})[，。]'
														
 
															                 for sentence in list_sentence:
														
 
															                     # pad_sentence = sentence.sentence_text
														
 
															                     othername = re.search(name_re1, sentence.sentence_text)
														
 
															                     if othername != None:
														
 
															-                        project_name = othername.group(3)
														
 
															+                        project_name = othername.group('name')
														
 
															                         beg = find_index([project_name], sentence.sentence_text)[0]
														
 
															                         end = beg + len(project_name)
														
 
															                         _name = self.fitDataByRule(sentence.sentence_text[beg:end])
														
 
															+                        # print('规则召回项目名称：', _name)
														
 
															                         # add name to entitys
														
 
															                         _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
														
 
															                         sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
														
@@ -444,7 +467,7 @@ class CodeNamePredict():
 
															                         w = 1
														
 
															                         if _name not in dict_name_freq_score:
														
 
															                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
														
 
															-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
														
 
															+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
														
 
															                         else:
														
 
															                             dict_name_freq_score[_name][0] += 1
														
 
															                 # othername = re.search(name_re1, sentence.sentence_text)
														
@@ -461,6 +484,8 @@ class CodeNamePredict():
 
															             if len(list_name_freq_score)>0:
														
 
															                 list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
														
 
															                 item['name'] = list_name_freq_score[0][0]
														
 
															+                # for it in list_name_freq_score:
														
 
															+                    # print('项目名称及分值：',it[0],it[1], it[1][0]*it[1][1])
														
 
															                 # if list_name_freq_score[0][1][0]>1:
														
 
															                 #     item[1]['name'] = list_name_freq_score[0][0]
														
 
															                 # else:
														
@@ -474,9 +499,10 @@ class CodeNamePredict():
 
															                     # if othercode != None:
														
 
															                     #     item[1]['code'].append(othercode.group(2))
														
 
															                     # 2020/11/23 大网站规则调整
														
 
															-                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[:：\s]+([^，。；：、]{8,30}[a-zA-Z0-9\号])[\)，。]', sentence.sentence_text)
														
 
															+                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[:：\s]+(?P<code>[^，。；：、]{8,30}[a-zA-Z0-9\号])[\)，。\u4e00-\u9fa5]', sentence.sentence_text)
														
 
															                     if othercode != None:
														
 
															-                        item['code'].append(othercode.group(3))
														
 
															+                        item['code'].append(othercode.group('code'))
														
 
															+                        # print('规则召回项目编号：', othercode.group('code'))
														
 
															             item['code'] = [code for code in item['code'] if len(code)<500]
														
 
															             item['code'].sort(key=lambda x:len(x),reverse=True)
														
 
															             result.append(item)
														
@@ -692,6 +718,7 @@ class PREMPredict():
 
															             text_tup = text_list[i]
														
 
															             front, middle, behind = text_tup
														
 
															             whole = "".join(text_tup)
														
 
															+            # print('模型预测角色：', front, entity.entity_text, label, values)
														
 
															             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他，让后面的规则召回重新判断
														
 
															                 label = 5
														
 
															             elif label in [2,3,4] and re.search('序号：\d+，\w{,2}候选', front):
														
@@ -770,6 +797,7 @@ class PREMPredict():
 
															             whole = "".join(text_tup)
														
 
															             # print('金额： ', entity.entity_text, label, values, front, middle, behind)
														
 
															             if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额，让后面的规则召回重新判断
														
 
															+                # print('模型预测金额： ', entity.entity_text, label, values, front, middle, behind)
														
 
															                 label = 2
														
 
															             elif label == 1: # 错误中标金额处理
														
 
															                 if re.search('[:：，。](总金额|总价|单价)(（万?元）)?：?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
														
@@ -1435,6 +1463,7 @@ class RoleRulePredictor():
 
															                                         self.pattern_money_other, _span[0]) is None:
														
 
															                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
														
 
															                                     p_entity.label = 0
														
 
															+                                    # print('规则召回预算金额：', p_entity.entity_text, _span[0])
														
 
															                                 if re.search(self.pattern_money_tenderer, _span[0]) is not None:
														
 
															                                     if re.search(self.pattern_money_other, _span[0]) is not None:
														
 
															                                         if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
														
@@ -1453,6 +1482,7 @@ class RoleRulePredictor():
 
															                                         , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子：191705231
														
 
															                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
														
 
															                                     p_entity.label = 0
														
 
															+                                    # print('规则召回预算金额2：', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
														
 
															             # 增加招标金额扩展，招标金额+连续的未识别金额，并且都可以匹配到标段信息，则将为识别的金额设置为招标金额
														
 
															             list_p = []