Parcourir la source

修复项目需求预算要素提取中的时间格式化问题;优化角色正则;优化金额正则;监理勘察设计项目对应金额处理

lishimin il y a 3 ans
Parent
commit
607132a794

+ 14 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -1713,7 +1713,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word":"((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|标的基本情况|CNY|成交结果:)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号]{,8}?))(第[123一二三]名[::])?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条米]*))\s*[)\)]?))",
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果:)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?元?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?元?(?P<filter_unit1>[台只吨斤棵株页亩方条米]*))\s*[)\)]?))",
                                   "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?元)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
                                   "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千]*)[\((]?(?P<unit_behind_m>[万亿]?元(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
@@ -1818,7 +1818,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 filter_unit = True
 
                     if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
-                        if re.search('[幢栋号楼层]', sentence_text[_match.span()[0]-2:_match.span()[0]]):
+                        if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0]-2):_match.span()[0]]):
                             entity_text = re.sub('\d+,', '', entity_text)
                         else:
                             entity_text = entity_text.replace(',', '.')
@@ -1830,7 +1830,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
                             unit = '元'
                             # print('明显金额特征补充单位 元')
-                        elif re.search('[单报标限]价|金额|价格[::]+$', text_beforeMoney.strip()) and \
+                        elif re.search('[单报标限]价|金额|价格|(监理|设计|勘察)(服务)?费[::]+$', text_beforeMoney.strip()) and \
                                 re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
                             unit = '元'
                             # print('明显金额特征补充单位 元')
@@ -1868,19 +1868,23 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
-                    # print('转换前金额:', entity_text, '单位:', unit)
-                    if re.search('总投资', sentence_text[_match.span()[0] - 6:_match.span()[0]]):  # 2021/8/5过滤掉总投资金额
+                    # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
+                    if re.search('总投资|投资总额|总预算|总概算|投资规模', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
                         # print('总投资金额: ', _match.group(0))
                         notes = '总投资'
-                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
-                        notes = '大写'
-                        # print("补充备注:notes = 大写")
+                    elif re.search('投资', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
+                        notes = '投资'
+                    elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
+                        cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
+                        notes = cost_re.group(1)
                     elif re.search('单价', sentence_text[_match.span()[0]:_match.span()[1]]):
                         notes = '单价'
-                        # print("补充备注:单价 ",sentence_text[_match.span()[0]-2:_match.span()[1]])
+                    elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
+                        notes = '大写'
+                        # print("补充备注:notes = 大写")
                     if len(unit)>0:
                         if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
-                            print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
+                            # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
                             entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit[0])/10000)
                             unit = '元' # 修正金额后单位 重置为元
                         else:

+ 49 - 31
BiddingKG/dl/interface/extract.py

@@ -42,7 +42,6 @@ class MyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 def predict(doc_id,text,title="",page_time="",**kwargs):
-
     cost_time = dict()
 
     start_time = time.time()
@@ -52,6 +51,11 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
+    #依赖句子顺序
+    start_time = time.time()
+    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
+    cost_time["channel"] = round(time.time()-start_time,2)
+
     start_time = time.time()
     codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
     log("get codename done of doc_id%s"%(doc_id))
@@ -86,6 +90,20 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     log("get time done of doc_id%s"%(doc_id))
     cost_time["time"] = round(time.time()-start_time,2)
 
+    # 需在getPredictor("prem")后  getAttributes.getPREMs 前
+    if len(re.findall('监理|施工|设计|勘察', title))==1 and re.search('施工|总承包|epc|EPC',title)==None:
+        keyword = re.search('监理|设计|勘察', title).group(0)
+        for list_entity in list_entitys:
+            for _entity in list_entity:
+                # print('keyword:',keyword, '_entity.notes :',_entity.notes)
+                if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2:
+                    if list_channel_dic[0]['docchannel'] == "招标公告":
+                        _entity.values[0] = 0.51
+                        _entity.set_Money(0, _entity.values)  #2021/11/18 根据公告类别把费用改为招标或中投标金额
+                    else:
+                        _entity.values[1] = 0.51
+                        _entity.set_Money(1, _entity.values)
+
     #依赖句子顺序
     start_time = time.time()
     entityLink.link_entitys(list_entitys)
@@ -93,11 +111,6 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
-    #依赖句子顺序
-    start_time = time.time()
-    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
-    cost_time["channel"] = round(time.time()-start_time,2)
-
     start_time = time.time()
     list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
     cost_time["punish"] = round(time.time()-start_time,2)
@@ -115,7 +128,6 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
-    #
     # for _article in list_articles:
     #     log(_article.content)
     #
@@ -143,34 +155,40 @@ def test(name,content):
 if __name__=="__main__":
     import pandas as pd
     t1 = time.time()
-    text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
-    title = '合同公告'
-    df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
-    # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
-    for i in range(30,50,1):
-        text = df.loc[i, 'dochtmlcon']
-        rs = json.loads(predict('', text, ''))
-        print(rs['demand_info'])
-        print(rs['product'])
-        print(rs['product_attrs'])
-    print(rs)
-
-    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
-    #     text = f.read()
-    #     print(predict('', text, title))
-
+    # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
+    title = '打印机'
+    # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
+    # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
+    # for i in range(30,50,1):
+    #     text = df.loc[i, 'dochtmlcon']
+    #     rs = json.loads(predict('', text, ''))
+    #     print(rs['demand_info'])
+    #     print(rs['product'])
+    #     print(rs['product_attrs'])
+    # print(rs)
+
+    with open('D:/html/138786703.html', 'r', encoding='utf-8') as f:
+        text = f.read()
+        print(predict('', text, title))
     # print(predict('',text,title))
-    # df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')[:20]
+
+    # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
+    # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
     # new_prem = []
     # for i in range(len(df)):
-    # # i = 246
+    #     i = 530
     #     doc_id = df.loc[i, 'docid']
-    #     text = df.loc[i, 'dochtmlcon']
-    #     title = df.loc[i, 'doctitle']
-    #     rs = predict(doc_id,text,title)
+    #     text = df.loc[i, 'html']
+    #     # title = df.loc[i, 'doctitle']
+    #     rs = predict(doc_id,text)
+    #     rs = json.loads(rs)
+    #     prem = json.dumps(rs['prem'], ensure_ascii=False)
     #     # print(rs)
-    #     new_prem.append(rs)
+    #     new_prem.append(prem)
+    #     print(prem)
+    #     break
     # df['new_prem'] = pd.Series(new_prem)
     # print('耗时:', time.time()-t1)
-    # df.to_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0813.xlsx')
-    pass
+    # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
+    # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
+    # # pass

+ 2 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -952,7 +952,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                                                          entity_after.values[entity_after.label])
                                         entity.pointer_money = entity_after
                                         # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)
-                                        break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
+                                        if entity_after.values[entity_after.label]>0.6:
+                                            break # 2021/7/16 新增,找到中标金额,非单价即停止,不再往后找金额
                                     #add pointer_money
                                     # entity.pointer_money = entity_after
                                     # print('role zhao money', entity.entity_text, '中标金额:', entity_after.entity_text)

+ 39 - 28
BiddingKG/dl/interface/predictor.py

@@ -676,10 +676,10 @@ class PREMPredict():
         for i in range(len(predict_y)):
             entity = points_entitys[i]
             label = np.argmax(predict_y[i])
-            values = []
-            for item in predict_y[i]:
-                values.append(item)
-                entity.set_Money(label,values)
+            values = predict_y[i]
+            if label ==0 and entity.notes=="投资":
+                values[label] = 0.49
+            entity.set_Money(label, values)
         
     def predict(self,list_sentences,list_entitys):
         self.predict_role(list_sentences,list_entitys)
@@ -1065,25 +1065,25 @@ class FormPredictor():
 class RoleRulePredictor():
     
     def __init__(self):
-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|:|:|\s*$))"
+        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|转让|招租|甲|议标|合同主体|比选)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|业主名称|需方|询价单位)(是|为|信息|:|:|\s*)$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?)|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)"
+        self.pattern_tenderee_right = "(?P<tenderee_right>^(\((以下简称)?[\"”]?(招标|采购)(人|单位|机构)\)?))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*$)|(受.{,20}委托))"
-        self.pattern_agency_right = "(?P<agency_right>(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.*委托)"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|招标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))"
+        self.pattern_agency_right = "(?P<agency_right>^(\((以下简称)?[\"”]?(代理)(人|单位|机构)\))|受.{,15}委托)"
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
-        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[^必须]{,4}[::是为]|(供应商|供货商|服务商|选定单位|指定的中介服务机构))[^必须]{,4}[::是为].{,2}|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)|((中标|成交)(结果|信息))(是|为|:|:|\s*$)|(单一来源采购(供应商|供货商|服务商))|((分包|标包).*供应商|供应商名称|服务机构|供方[::]))"
-        self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[^必须]{,4}[::是为])"
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_whole>贵公司.*以.*中标|最终由.*竞买成功|经.*[以由].*中标|成交供应商,成交供应商名称:|谈判结果:由.{5,20}供货)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
+        self.pattern_winTenderer_left = "(?P<winTenderer_left>((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)"
+        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))"
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
-        self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[^必须]{,4}[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
+        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(是|为|:|:|\s*$))|((评审结果|名次|排名)[::]第?[二2]名?))"
-        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
+        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))|((评审结果|名次|排名)[::]第?[三3]名?))"
-        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
+        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
         
         self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left],
                                       ["C",self.pattern_tenderee_center],
@@ -1091,7 +1091,7 @@ class RoleRulePredictor():
                                  "1":[["L",self.pattern_agency_left],
                                       ["R",self.pattern_agency_right]],
                                  "2":[["L",self.pattern_winTenderer_left],
-                                      ["C",self.pattern_winTenderer_center],
+                                      # ["C",self.pattern_winTenderer_center],
                                       ["R",self.pattern_winTenderer_right],
                                       ["W",self.pattern_winTenderer_whole]],
                                  "3":[["L",self.pattern_secondTenderer_left],
@@ -1183,7 +1183,7 @@ class RoleRulePredictor():
 
                                 #使用正则+距离解决冲突
                                 # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
-                                list_spans = [spans[0][-30:],spans[0][-20:]+spans[1],spans[2]]
+                                list_spans = [spans[0][-30:],spans[0][-10:]+spans[1]+spans[2][:10],spans[2]]
                                 for _i_span in range(len(list_spans)):
                                     # print(list_spans[_i_span],p_entity.entity_text)
                                     for _pattern in self.pattern_whole:
@@ -1710,11 +1710,14 @@ class ProductAttributesPredictor():
             order_begin = "%s-%s-01" % (year, month)
             order_end = "%s-%s-%s" % (year, month, num)
             return order_begin, order_end
-        if  re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)\d{1,2}日?$', text):
-            text = re.sub('年|月|/|-', '-', text)
-            text = text.replace('日', '')
-            order_begin = text
-            order_end = text
+        t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
+        if t2:
+            y = t2.group(1)
+            m = t2.group(3)
+            d = t2.group(5)
+            m = '0'+ m if len(m)<2 else m
+            d = '0'+d if len(d)<2 else d
+            order_begin = order_end = "%s-%s-%s"%(y,m,d)
             return order_begin, order_end
         all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
                                 '(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
@@ -1743,9 +1746,10 @@ class ProductAttributesPredictor():
         y2 = y1 if y2 == "" else y2
         d1 = '1' if d1 == "" else d1
         d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
-        for it in (m1,d1,m2,d2):
-            if len(it)<2:
-                it = '0'+it
+        m1 = '0' + m1 if len(m1) < 2 else m1
+        m2 = '0' + m2 if len(m2) < 2 else m2
+        d1 = '0' + d1 if len(d1) < 2 else d1
+        d2 = '0' + d2 if len(d2) < 2 else d2
         order_begin = "%s-%s-%s"%(y1,m1,d1)
         order_end = "%s-%s-%s"%(y2,m2,d2)
         return order_begin, order_end
@@ -2101,6 +2105,8 @@ class DocChannel():
       doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
     else:
       doc_sens = ' '.join(doc_word_list[:self.sequen_len])
+    # print('标题:',segword_title)
+    # print('正文:',segword_content)
     datas.append(doc_sens.split())
     datas_title.append(segword_title.split())
     # print('完成预处理')
@@ -2131,7 +2137,10 @@ class DocChannel():
       tokens = [it for l in token_l for it in l]
       content = ' '.join(tokens[:500])
 
-    data_content, data_title = self.predict_process(docid='', doctitle=title[:50], dochtmlcon=content) # 标题最多取50字
+    title = re.sub('[^\u4e00-\u9fa5]', '', title)
+    if len(title)>50:
+        title = title[:20]+title[-30:]
+    data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
     text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
     title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
 
@@ -2147,6 +2156,7 @@ class DocChannel():
                             )
     id = np.argmax(pred, axis=1)[0]
     prob = pred[0][id]
+    # print('公告类别:', self.id2type[id], '概率:',prob)
     if id == 0:
       pred = self.lift_sess.run(self.lift_softmax,
                                       feed_dict={
@@ -2158,6 +2168,7 @@ class DocChannel():
                               )
       id = np.argmax(pred, axis=1)[0]
       prob = pred[0][id]
+      # print('生命周期:',self.id2life[id], '概率:',prob)
       if id == 6:
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
           # return '候选人公示', prob