ソースを参照

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BiddingKG/dl/interface/Preprocessing.py
#	BiddingKG/dl/interface/extract.py
znj 3 年 前
コミット
1486f19438

+ 2 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -356,8 +356,9 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
 
 def isLegalEnterprise(name):
+    '''
+    @summary: decide whether an entity name may be kept as an enterprise name
+    @param:
+        name: the candidate name (str)
+    @return: False when the name starts with one of 省/市/区/县, or consists only of
+        optional leading "*" characters, at most three further characters and one of
+        the listed suffixes (分公司, 街道, 有限公司, ...), or contains 标段/标包/名称;
+        True otherwise
+    '''
     is_legal = True
-    if re.search("^[省市区县]",name) is not None or re.search("^.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会)$",name) or re.search("标段|标包|名称",name) is not None:
+    # raw string: "\*" is an invalid escape in a normal literal; the pattern itself is unchanged
+    if re.search("^[省市区县]",name) is not None or re.search(r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name) or re.search("标段|标包|名称",name) is not None:
         is_legal = False
     return is_legal
 
 def fix_LEGAL_ENTERPRISE():

+ 82 - 15
BiddingKG/dl/interface/Preprocessing.py

@@ -407,7 +407,7 @@ def tableToText(soup):
         # for item in inner_table:
         #     print(item)
         # print("======")
-        
+
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
 
@@ -635,15 +635,15 @@ def tableToText(soup):
                         count_flag = False
             if count_flag and len(count_set)>=2:
                 return "column"
-        if count_column_keys>count_row_keys:
-            return "column"
+        # if count_column_keys>count_row_keys:  #2022/2/15 此项不够严谨,造成很多错误,故取消
+        #     return "column"
         return "row"
         
             
     #根据表格处理方向生成句子,        
     def getTableText(inner_table,head_list,key_direct=False):
         # packPattern = "(标包|[标包][号段名])"
-        packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
+        packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见)"  # 2020/11/23 大网站规则,添加序号为排序
         entityPattern = "((候选|([中投]标|报价))(单位|公司|人|供应商))"
         moneyPattern = "([中投]标|报价)(金额|价)"
@@ -658,7 +658,6 @@ def tableToText(soup):
                 
             direct = getDirect(inner_table, head_begin, head_end)
 
-
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
                 text_line = ""
@@ -707,9 +706,9 @@ def tableToText(soup):
                                 if table_occurence[i-loop_i][j]["type"] in key_values:
                                     if find_flag:
                                         if table_occurence[i-loop_i][j]["text"]!=temp_head:
-                                            top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                            top_head = table_occurence[i-loop_i][j]["text"]+""+top_head
                                     else:
-                                        top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                        top_head = table_occurence[i-loop_i][j]["text"]+""+top_head
                                     find_flag = True
                                     temp_head = table_occurence[i-loop_i][j]["text"]
                                     table_occurence[i-loop_i][j]["occu_count"] += 1
@@ -733,9 +732,9 @@ def tableToText(soup):
                                 if table_occurence[i][j-loop_j]["type"] in key_values:
                                     if find_flag:
                                         if table_occurence[i][j-loop_j]["text"]!=temp_head:
-                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                            left_head = table_occurence[i][j-loop_j]["text"]+""+left_head
                                     else:
-                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                        left_head = table_occurence[i][j-loop_j]["text"]+""+left_head
                                     find_flag = True
                                     temp_head = table_occurence[i][j-loop_j]["text"]
                                     table_occurence[i][j-loop_j]["occu_count"] += 1
@@ -757,7 +756,7 @@ def tableToText(soup):
                             if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
 
                                 cell = table_occurence[i][j]
-                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
+                                head = (cell["top_head"]+"") if len(cell["top_head"])>0 else ""
                                 if re.search("单报标限总]价|金额|成交报?价|报价", head):
                                     head = cell["left_head"] + head
                                 else:
@@ -994,12 +993,22 @@ def tableToText(soup):
 
 
         
-    pat_head = re.compile('^(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|标包|分包|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造|企业资质|质量目标|工期目标)$')
+    pat_head = re.compile('^(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|标包|分包|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造|企业资质|质量目标|工期目标|(需求|服务|项目|施工|采购|招租|出租|转让|出让|业主|询价|委托|权属|招标|竞得|抽取|承建)(人|方|单位)(名称)?|(供应商|供货商|服务商)(名称)?)$')
     #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
     pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
 
     list_innerTable = []
 
+    # 2022/2/9 删除干扰标签
+    for tag in soup.find_all('option'): #例子: 216661412
+        if 'selected' not in tag.attrs:
+            tag.extract()
+    for ul in soup.find_all('ul'): #例子 156439663 多个不同channel 类别的标题
+        if ul.find_all('li') == ul.findChildren(recursive=False) and len(set(re.findall(
+                '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|开标评标|资格评审',
+                ul.get_text(), re.S)))>3:
+            ul.extract()
+
     tbodies = soup.find_all('table')
     # 遍历表格中的每个tbody
     #逆序处理嵌套表格
@@ -1570,7 +1579,7 @@ def union_ner(list_ner):
 #         list_entitys.append(list_entitys_temp)
 #     return list_articles,list_sentences,list_entitys,cost_time
 
-def get_preprocessed(articles,useselffool=False):
+def get_preprocessed(articles, useselffool=False):
     '''
     @summary:预处理步骤,NLP处理、实体识别
     @param:
@@ -1585,7 +1594,57 @@ def get_preprocessed(articles,useselffool=False):
     calibrateEnterprise(list_articles,list_sentences,list_entitys)
 
     return list_articles,list_sentences,list_entitys,list_outlines,cost_time
-    
+
+def special_treatment(sourceContent, web_source_no):
+    '''
+    @summary: apply per-data-source text rewrites before/after the generic preprocessing
+    @param:
+        sourceContent: the text to patch (str)
+        web_source_no: data-source identifier selecting the rule, e.g. '03786-10'
+    @return: the patched text; the input is returned unchanged when web_source_no has
+        no rule or the rule's pattern does not match
+    '''
+    if web_source_no == 'DX000202-1':
+        # "【name-amount,name-amount,】" list -> one "标段N ..." sentence per entry
+        ser = re.search(r'中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
+        if ser:
+            new = ""
+            for i, it in enumerate(ser.group(1).split(',')):
+                if '-' in it:
+                    role, money = it.split('-')
+                    new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
+            sourceContent = sourceContent.replace(ser.group(0), new, 1)
+    elif web_source_no == '03786-10':
+        ser1 = re.search(r'中标价:([\d,.]+)', sourceContent)
+        ser2 = re.search(r'合同金额[((]万元[))]:([\d,.]+)', sourceContent)
+        if ser1 and ser2:
+            m1 = ser1.group(1).replace(',', '')
+            m2 = ser2.group(1).replace(',', '')
+            try:
+                bid_price = float(m1)
+            except ValueError:
+                # the capture allows things like "1.2.3" or "."; leave the text alone
+                bid_price = None
+            if bid_price is not None and bid_price < 100000 and (m1.split('.')[0] == m2.split('.')[0] or m2 == '0'):
+                new = '中标价(万元):' + m1
+                sourceContent = sourceContent.replace(ser1.group(0), new, 1)
+    elif web_source_no == '00076-4':
+        ser = re.search(r'主要标的数量:([0-9一]+)\w{,3},主要标的单价:([\d,.]+)元?,合同金额:(.00),', sourceContent)
+        if ser:
+            num = ser.group(1).replace('一', '1')
+            try:
+                num = 1 if num == '0' else num  # a quantity of 0 is treated as 1
+                unit_price = ser.group(2).replace(',', '')
+                total_price = str(int(num) * float(unit_price))
+                new = '合同金额:' + total_price
+                sourceContent = sourceContent.replace('合同金额:.00', new, 1)
+            except ValueError as e:
+                # int()/float() rejected the captured text; keep the content as it was
+                log('preprocessing.py special_treatment exception: %s' % e)
+    elif web_source_no == 'DX000105-2':
+        if re.search("成交公示", sourceContent) and re.search(',投标人:', sourceContent) and re.search(',成交人:', sourceContent) is None:
+            sourceContent = sourceContent.replace(',投标人:', ',成交人:')
+    elif web_source_no in ['04080-3', '04080-4']:
+        ser = re.search('合同金额:([0-9,]+.[0-9]{3,})(.{,4})', sourceContent)
+        # only relabel when the four characters after the amount do not already mention 万
+        if ser and '万' not in ser.group(2):
+            sourceContent = sourceContent.replace('合同金额:', '合同金额(万元):')
+    elif web_source_no == '03761-3':
+        ser = re.search('中标价,([0-9]+)[.0-9]*%', sourceContent)
+        # a "percentage" above 100 is rewritten with the unit 元 instead of %
+        if ser and int(ser.group(1)) > 100:
+            sourceContent = sourceContent.replace(ser.group(0), ser.group(0)[:-1] + '元')
+    elif web_source_no == '00695-7':
+        # str.replace is a no-op when the label is absent, so no prior search is needed
+        sourceContent = sourceContent.replace('支付金额:', '合同金额:')
+    return sourceContent
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
@@ -1612,6 +1671,11 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         _send_doc_id = article[3]
         _title = article[4]
         page_time = article[5]
+        web_source_no = article[6]
+        '''特别数据源对 html 做特别修改'''
+        if web_source_no in ['DX000202-1']:
+            sourceContent = special_treatment(sourceContent, web_source_no)
+
         #表格处理
         key_preprocess = "tableToText"
         start_time = time.time()
@@ -1631,6 +1695,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
+        '''特别数据源对 预处理后文本 做特别修改'''
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7']:
+            article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway
         list_bidway = extract_bidway(article_processed, _title)
@@ -1870,7 +1937,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
             for it in re.finditer(
-                    '(?P<text_key_word>[^,。、;《]{,5}(单一来源|中标|中选|中价|成交)?(供应商|供货商|服务商|候选人|单位|人)(名称)?[为::]+)(?P<text>([^,。、;《]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
+                    '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为::]+)(?P<text>([^,。、;《::]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -2149,7 +2216,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         else:
                             entity_text = str(getUnifyMoney(entity_text))
 
-                    if float(entity_text)<100 or float(entity_text)>100000000000:
+                    if float(entity_text)>100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
                         # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue
 

+ 27 - 10
BiddingKG/dl/interface/extract.py

@@ -42,19 +42,19 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
-def predict(doc_id,text,title="",page_time="",**kwargs):
+def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
-    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
+    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
     cost_time.update(_cost_time)
 
     #依赖句子顺序
     start_time = time.time() # 公告类型/生命周期提取
-    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
+    channel_dic = predictor.getPredictor("channel").predict(title=title, list_sentence=list_sentences[0], web_source_no=web_source_no)
     cost_time["channel"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 项目编号、名称提取
@@ -68,14 +68,14 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
     cost_time["prem"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品名称及废标原因提取
-    fail = list_channel_dic[0]['docchannel'] == "废标公告"
+    fail = channel_dic['docchannel']['docchannel'] == "废标公告"
     fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
 
     start_time = time.time() # 产品相关要素正则提取 单价、数量、品牌规格 ; 项目、需求、预算、时间
-    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
+    product_attrs, total_product_money = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
     log("get product attributes done of doc_id%s"%(doc_id))
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
@@ -85,7 +85,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
     cost_time["roleRuleFinal"] = round(time.time()-start_time,2)
 
     start_time = time.time() #联系人模型提取
@@ -109,7 +109,7 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
             for _entity in list_entity:
                 # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                 if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2:
-                    if list_channel_dic[0]['docchannel'] == "招标公告":
+                    if channel_dic['docchannel'] == "招标公告":
                         _entity.values[0] = 0.51
                         _entity.set_Money(0, _entity.values)  #2021/11/18 根据公告类别把费用改为招标或中投标金额
                     else:
@@ -138,15 +138,32 @@ def predict(doc_id,text,title="",page_time="",**kwargs):
                 if product in d['project_name']:
                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
+    '''修正采购公告表格形式多种采购产品中标价格'''
+    if total_product_money>0 and len(prem[0]['prem'])==1:
+        for value in prem[0]['prem'].values():
+            for l in value['roleList']:
+                try:
+                    if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
+                        l[2] = total_product_money
+                        log('修改中标金额为所有产品总金额')
+                except Exception as e:
+                    log('表格产品价格修正中标价格报错:%s'%e)
+
+    '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别'''
+    start_time = time.time()
+    content = list_articles[0].content
+    channel_dic = predictor.getPredictor("channel").predict_rule(title, content, channel_dic, prem_dic=prem[0]['prem'])
+    cost_time["rule_channel"] = round(time.time()-start_time,2)
+
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 
     # for _article in list_articles:
-    #     log(_article.content)
-
+    #         log(_article.content)
+    #
     # for list_entity in list_entitys:
     #     for _entity in list_entity:
     #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%

+ 4 - 4
BiddingKG/dl/interface/getAttributes.py

@@ -560,11 +560,11 @@ def getPackagesFromArticle(list_sentence,list_entity):
     dict_packageCode = dict()
     
     package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
-    package_N_name_pattern = re.compile("[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
-    package_number_pattern = re.compile("(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
+    package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
+    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
     # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
-    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
-    win_tenderer_pattern = re.compile('(中标人|供应商)[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
+    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
+    win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
     model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')  # 2020/11/23 大网站规则 调整
     number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
 

+ 136 - 40
BiddingKG/dl/interface/predictor.py

@@ -667,8 +667,12 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:.{,25}我公司', text):
                     label = 0
                     values[label] = 0.801
-            if label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
+            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                 label = 0
+                values[label] = 0.501
+            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
+                label = 2
+                values[label] = 0.501
             entity.set_Role(label, values)
 
     def predict_money(self,list_sentences,list_entitys):
@@ -1096,25 +1100,25 @@ class RoleRulePredictor():
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
         self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
-                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
-                                "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
+                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
+                                "[))]?(信息[,:])?(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
-                                     "(名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
-        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{,20}委托))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^拟对|^现就|^现委托)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
-
-        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
-        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托)"  # |^受托  会与 受托生产等冲突,暂时为发现受托表达代理方式
+                                     "(名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
+        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
+        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
+        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
-        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
-                                        "(选定单位|指定的中介服务机构|实施主体|承制单位)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::,]*$|" \
-                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))(是|为|:|:)$|(供应|供货|供|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]$)"
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$)"
+        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
+                                        "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
+                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
         # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
         # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
         self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
-                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^你方于))"
+                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商))"
         self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
         # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
@@ -1129,6 +1133,7 @@ class RoleRulePredictor():
                               self.pattern_tenderee_left_w1,
                               self.pattern_tenderee_center,
                               self.pattern_tenderee_right,
+                              self.pattern_tendereeORagency_right,
                               self.pattern_agency_left,
                               self.pattern_agency_right,
                               self.pattern_winTenderer_left,
@@ -1194,6 +1199,8 @@ class RoleRulePredictor():
                                         find_flag = True
                                         if p_entity.values[0] > on_value:
                                             p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
+                                        else:
+                                            p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
                         if find_flag:
                             continue
 
@@ -1258,11 +1265,20 @@ class RoleRulePredictor():
                                             for _group, _v_group in _iter.groupdict().items():
                                                 if _v_group is not None and _v_group != "":
                                                     _role = _group.split("_")[0]
+                                                    if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
+                                                        print('p_entity_sentenceindex:', p_entity.sentence_index)
+                                                        if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
+                                                            continue
+                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
+                                                            or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
+                                                            _role = 'tenderee'
+                                                        else:
+                                                            _role = "agency"
                                                     _direct = _group.split("_")[1]
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商',
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[
                                                                                                             0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
@@ -1369,12 +1385,13 @@ class RoleRulePredictor():
 
 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
 class RoleRuleFinalAdd():
-    def predict(self, list_articles, list_entitys):
+    def predict(self, list_articles, list_entitys, list_codenames):
         text_end = list_articles[0].content[-40:]
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}年.{1,2}月.{1,2}日', text_end)
+        sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         sear_ent3 = re.search('(报名咨询|收货地点|送货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
+
         if sear_ent or sear_ent2 or sear_ent3:
             if sear_ent3:
                 ent_re = sear_ent3.group(2)
@@ -1394,31 +1411,53 @@ class RoleRuleFinalAdd():
                         agency_notfound = False
                     elif ent.label == 5:
                         ents.append(ent)
-            if agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
+            if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
+                                              or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
                 n = 0
                 for i in range(len(ents) - 1, -1, -1):
                     n += 1
-                    if n > 3 and sear_ent:  # 文章末尾角色加日期这种只找后三个实体
+                    if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
                         break
                     if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 1
-                        ents[i].values[1] = 0.5
+                        ents[i].label = 0
+                        ents[i].values[0] = 0.5
                         # log('正则最后补充实体: %s'%(ent_re))
                         break
-
-            elif tenderee_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None:
+            elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
                 n = 0
                 for i in range(len(ents) - 1, -1, -1):
                     n += 1
-                    if n > 3 and sear_ent: # 文章末尾角色加日期这种只找后三个实体
+                    if n > 3 and sear_ent:  # 文章末尾角色加日期这种只找后三个实体
                         break
                     if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
-                        ents[i].label = 0
-                        ents[i].values[0] = 0.5
+                        ents[i].label = 1
+                        ents[i].values[1] = 0.5
                         # log('正则最后补充实体: %s'%(ent_re))
                         break
 
 
+        elif list_codenames[0]['name'] != "":  #把标题包含的公司实体作为招标人
+            tenderee_notfound = True
+            ents = []
+            for ent in list_entitys[0]:
+                if ent.entity_type in ['org', 'company']:
+                    if ent.label == 0:
+                        tenderee_notfound = False
+                    elif ent.label == 1:
+                        agency_notfound = False
+                    elif ent.label == 5:
+                        ents.append(ent)
+            if tenderee_notfound == True:
+                print('list_codenames',list_codenames[0]['name'])
+                for ent in ents:
+                    if ent.entity_text in list_codenames[0]['name']:
+                        ent.label = 0
+                        ent.values[0] = 0.5
+                        # log('正则召回标题中包含的实体:%s'%ent.entity_text)
+                        break
+
+
+
 # 时间类别
 class TimePredictor():
     def __init__(self):
@@ -1923,7 +1962,7 @@ class ProductAttributesPredictor():
             for j in range(i + 1, len(items)):
                 if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                     continue
-                if re.search('数量', items[j]):
+                if header_dic['数量']=="" and re.search('数量', items[j]):
                     header_dic['数量'] = j
                     quantity = items[j]
                 elif re.search('单价', items[j]):
@@ -1972,6 +2011,7 @@ class ProductAttributesPredictor():
         header_col = []
         product_link = []
         demand_link = []
+        total_product_money = 0
         for i in range(len(tables)-1, -1, -1):
             table = tables[i]
             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
@@ -2111,6 +2151,12 @@ class ProductAttributesPredictor():
                                                       'brand': brand[:50], 'specs':specs}
                             if link not in product_link:
                                 product_link.append(link)
+                                mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
+                                if link['unitPrice'] != "" and mat:
+                                    try:
+                                        total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
+                                    except:
+                                        log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
                         if budget != "" and order_time != "" :
                             link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
                             if link not in demand_link:
@@ -2126,7 +2172,7 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
         else:
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
-        return [attr_dic, demand_dic]
+        return [attr_dic, demand_dic], total_product_money
 
 # docchannel类型提取
 class DocChannel():
@@ -2260,10 +2306,12 @@ class DocChannel():
     else:
       return 0
 
-  def predict(self, title='', content=''):
-    # print('准备预测')
-    if isinstance(content, list):
-      token_l = [it.tokens for it in content]
+  def predict(self, title='', list_sentence='', web_source_no=''):
+    if web_source_no in ['02104-7']:
+      return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
+
+    if isinstance(list_sentence, list):
+      token_l = [it.tokens for it in list_sentence]
       tokens = [it for l in token_l for it in l]
       content = ' '.join(tokens[:500])
 
@@ -2273,6 +2321,7 @@ class DocChannel():
     data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字
     text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
     title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
+    result = {'docchannel': {'docchannel':'', 'doctype':''}}
 
     array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
     array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
@@ -2286,8 +2335,10 @@ class DocChannel():
                             )
     id = np.argmax(pred, axis=1)[0]
     prob = pred[0][id]
+    result['docchannel']['doctype'] = self.id2type[id]
     # print('公告类别:', self.id2type[id], '概率:',prob)
-    if id == 0:
+    # if id == 0:
+    if result['docchannel']['doctype'] not in ['', '新闻资讯']:
       pred = self.lift_sess.run(self.lift_softmax,
                                       feed_dict={
                                                 self.lift_title: array_title,
@@ -2298,16 +2349,61 @@ class DocChannel():
                               )
       id = np.argmax(pred, axis=1)[0]
       prob = pred[0][id]
+      result['docchannel']['docchannel'] = self.id2life[id]
       # print('生命周期:',self.id2life[id], '概率:',prob)
-      if id == 6:
+      # if id == 6:
+      if result['docchannel']['docchannel'] == '中标信息':
         if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          result['docchannel']['docchannel'] = '候选人公示'
           # return '候选人公示', prob
-          return [{'docchannel': '候选人公示'}]
-      # return self.id2life[id], prob
-      return [{'docchannel':self.id2life[id]}]
-    else:
-      # return self.id2type[id], prob
-      return [{'docchannel':self.id2type[id]}]
+          # return [{'docchannel': '候选人公示'}]
+
+    return result
+    #   return [{'docchannel':self.id2life[id]}]
+    # else:
+    #   # return self.id2type[id], prob
+    #   return [{'docchannel':self.id2type[id]}]
+
+  def predict_rule(self, title, content, channel_dic, prem_dic):
+      '''Added 2022/2/10: rules so that certain data sources, and announcements whose content is too short and has no category keyword, are not predicted; rewrites channel_dic['docchannel']['docchannel'] in place from title/content regexes and returns channel_dic.'''
+      hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # contract-title regex
+      zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'  # title regex that maps to '中标信息'
+      zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'  # winner-label regex, searched in the content only
+      zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'  # title regex that maps to '招标公告'
+      title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)  # keep only characters in U+4E00-U+9FA5 of the title
+      if len(re.sub('[^\u4e00-\u9fa5]', "", content))<50 and channel_dic['docchannel']['doctype'] != '新闻资讯':  # short content (under 50 such characters) and not '新闻资讯': decide from the title alone
+          if re.search(hetong, title_cn) != None:
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+          elif re.search(zhongbiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '中标信息'
+          elif re.search(zhaobiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '招标公告'
+          else:
+              channel_dic['docchannel']['docchannel'] = ''  # no title pattern matched: clear the channel
+      elif channel_dic['docchannel'].get('docchannel', '') == '招标公告' and 'win_tenderer' in json.dumps(prem_dic,
+                                                                                              ensure_ascii=False):  # channel is '招标公告' but the serialized prem_dic mentions 'win_tenderer'
+          if re.search(hetong, title_cn) != None:
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+              log('正则把招标公告修改为合同公告')
+          elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
+                                                                                                      content):  # title regex is also tried on the first 200 characters of content
+              channel_dic['docchannel']['docchannel'] = '中标信息'
+              log('正则把招标公告修改为中标信息')
+      elif channel_dic['docchannel'].get('docchannel', '') == '中标信息' and 'win_tenderer' not in json.dumps(prem_dic,
+                                                                                                    ensure_ascii=False):  # channel is '中标信息' but the serialized prem_dic has no 'win_tenderer'
+          if re.search(hetong, title_cn):
+              channel_dic['docchannel']['docchannel'] = '合同公告'
+              log('正则把中标信息修改为合同公告')
+          elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
+                                                                                                      content):
+              pass  # one of the '中标信息' patterns matched: leave the channel unchanged
+          elif re.search(zhaobiao_t, title_cn):
+              channel_dic['docchannel']['docchannel'] = '招标公告'
+              log('正则把中标信息修改为招标公告')
+          elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content)==None:  # none of these keywords in title_cn + content
+              channel_dic['docchannel']['docchannel'] = ''
+              log('正则把中标信息修改为空')
+      return channel_dic  # the same dict object that was passed in, modified in place
 
 # 保证金支付方式提取
 class DepositPaymentWay():