Explorar o código

Merge remote-tracking branch 'origin/master'

fangjiasheng hai 9 meses
pai
achega
8e62db7d57

+ 103 - 2
BiddingKG/dl/common/Utils.py

@@ -956,11 +956,20 @@ def money_process(money_text, header):
         money_re = re_price.group(0)
         if (re.search('万元|[((]万[))]',  header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
             money_re += '万元'
+        elif (re.search('亿元|[((]亿[))]',  header) or re.search('亿元|[((]亿[))]', money_text)) and '亿' not in money_re:  # 修复37797825 控制价(万) # 修复 460307391 万元不在表头,在数字前面
+            money_re += '亿元'
         # money = float(getUnifyMoney(money_text))
         money = float(getUnifyMoney(money_re))
         if money > 10000000000000:  # 大于万亿的去除
             money = 0
-        money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
+        # money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
+        if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None:
+            if '万' in money_re:
+                money_unit = '万元'
+            elif '亿' in money_re:
+                money_unit = '亿元'
+            else:
+                money_unit = '元'
     return (money, money_unit)
 
 package_number_pattern = re.compile(
@@ -968,7 +977,7 @@ package_number_pattern = re.compile(
 |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
 |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
-|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+|([,;。、:(]|^)(标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
 |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
 |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
 filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
@@ -1021,6 +1030,98 @@ def find_package(content):
         # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
     return packages
 
+def cut_repeat_name(s):
+    '''
+    Collapse a company name that is one unit repeated back to back.
+
+    The shortest unit of at least 4 characters whose whole-number repetition
+    (two or more times) reproduces ``s`` exactly is returned, e.g.
+    "ABCD公司ABCD公司" -> "ABCD公司".  Strings shorter than 8 characters, and
+    strings that are not an exact repetition, are returned unchanged.
+
+    :param s: company name text
+    :return: the de-duplicated name, or ``s`` itself when nothing repeats
+    '''
+    total = len(s)
+    # Two copies of a unit of >= 4 characters need at least 8 characters.
+    if total < 8:
+        return s
+    # Try candidate unit lengths from short to long so the smallest unit wins.
+    # Checking the period directly (instead of counting occurrences of the
+    # last 4 characters) also handles units that contain their own tail more
+    # than once, such as "甲有限公司乙有限公司" repeated twice.
+    for unit_len in range(4, total // 2 + 1):
+        if total % unit_len:
+            continue
+        unit = s[:unit_len]
+        if unit * (total // unit_len) == s:
+            return unit
+    return s
+
+def del_tabel_achievement(soup):
+    '''
+    Strip past-performance ("业绩") content out of the tables of ``soup`` in place.
+
+    Nothing is done unless the first 800 characters of ``soup.text`` mention an
+    award/result keyword and the whole text mentions "业绩".  Two passes follow:
+    the first removes whole tables (and the tag in front of them) that look like
+    performance listings, the second removes individual rows inside the
+    remaining tables.
+
+    :param soup: parsed document; the code uses ``.text``, ``find_all``,
+                 ``findPreviousSibling`` and ``extract`` on it
+                 (presumably a BeautifulSoup tree -- confirm against callers)
+    :return: always None; the tree is modified through ``extract()``
+    '''
+    # Guard: skip documents that are not award/result notices or never mention "业绩".
+    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
+        return None
+    # Pattern for the short heading expected right in front of a performance table.
+    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
+    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
+    # Pass 1: drop a table when the tag before it matches p1 and the table's
+    # first row looks like a performance header.
+    for tag in soup.find_all('table'):
+        pre_text = ""
+        if tag.findPreviousSibling() != None:
+            pre_text = tag.findPreviousSibling().text.strip()
+            if pre_text == "" and tag.findPreviousSibling().findPreviousSibling() != None: # the tag right before the table is empty; fall back to the one before it
+                pre_text = tag.findPreviousSibling().findPreviousSibling().text.strip()
+
+        # Text of the first row only; empty string when the table has no <tr>.
+        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
+        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
+        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
+            # Count first-row cells that read like performance-table column headers.
+            _count = 0
+            for td in tag.find('tr').find_all('td'):
+                td_text = td.text.strip()
+                # A cell longer than 25 characters ends the header scan.
+                # NOTE(review): a cell of exactly 25 characters neither stops the scan nor counts.
+                if len(td_text) > 25:
+                    break
+                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
+                    _count += 1
+                # Two header-like cells are enough: remove the preceding tag and the table.
+                if _count >=2:
+                    # NOTE(review): this extracts the immediate previous sibling even when
+                    # pre_text was taken from the sibling before it -- confirm that is intended.
+                    pre_tag = tag.findPreviousSibling().extract()
+                    del_tag = tag.extract()
+                    # print('删除表格业绩内容', pre_tag.text + del_tag.text)
+                    break
+        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
+            # No matching heading, but the first row itself names the performance columns.
+            del_tag = tag.extract()
+            # print('删除表格业绩内容', del_tag.text)
+    # Rows queued for removal by pass 2.
+    # NOTE(review): the list is created once and not cleared between tables, so rows
+    # queued for an earlier table are passed to extract() again for later tables.
+    del_trs = []
+    '''删除表格某些行公布的业绩信息'''
+    # Pass 2: inside tables that mention "业绩", queue individual rows and remove them.
+    for tag in soup.find_all('table'):
+        text = tag.text
+        if re.search('业绩', text) == None:
+            continue
+        # for tr in tag.find_all('tr'):
+        trs = tag.find_all('tr')
+        # Manual index so that spanned row groups can be jumped over.
+        i = 0
+        while i < len(trs):
+            tr = trs[i]
+            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
+                # Two-cell row: a short "业绩" label followed by a cell listing projects.
+                td1_text =tr.td.text
+                td2_text =tr.td.findNextSibling().text
+                # Queue the row when the second cell yields at least two project-like matches.
+                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
+                    # del_tag = tr.extract()
+                    # print('删除表格业绩内容', del_tag.text)
+                    del_trs.append(tr)
+            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
+                # First cell starts or ends with "业绩": inspect how far it spans.
+                rows = tr.td.attrs.get('rowspan', '')
+                cols = tr.td.attrs.get('colspan', '')
+                if rows.isdigit() and int(rows)>2:
+                    # Label cell spans several rows: queue every row it covers.
+                    for j in range(int(rows)):
+                        if i+j < len(trs):
+                            del_trs.append(trs[i+j])
+                    # j is rows-1 here; together with the i += 1 below this moves past the span.
+                    i += j
+                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
+                    # Single full-width title row: compare its colspan with the next row's total.
+                    next_tr_cols = 0
+                    td_num = 0
+                    for td in trs[i+1].find_all('td'):
+                        td_num += 1
+                        if td.attrs.get('colspan', '').isdigit():
+                            next_tr_cols += int(td.attrs.get('colspan', ''))
+                    if next_tr_cols == int(cols):
+                        del_trs.append(tr)
+                        # Queue following rows until a single-cell row or a noticeably narrower row.
+                        for j in range(1,len(trs)-i):
+                            if len(trs[i+j].find_all('td')) == 1:
+                                break
+                            elif len(trs[i+j].find_all('td')) >= td_num-1:
+                                del_trs.append(trs[i+j])
+                            else:
+                                break
+                        # NOTE(review): after a break, i += j plus the i += 1 below steps over the
+                        # row that ended the scan, so that row is never examined -- confirm intended.
+                        i += j
+            i += 1
+        # Detach every queued row from the tree.
+        for tr in del_trs:
+            del_tag = tr.extract()
+            # print('删除表格业绩内容', del_tag.text)
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 8 - 77
BiddingKG/dl/interface/Preprocessing.py

@@ -952,7 +952,7 @@ def tableToText(soup, docid=None):
             count_flag = True
             for width_index in range(width):
                 if inner_table[height][width_index][1]==0:
-                    if re.search(company_pattern,inner_table[height][width_index][0])  is not None:
+                    if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
                         count_set.add(inner_table[height][width_index][0])
                     else:
                         count_flag = False
@@ -1082,7 +1082,7 @@ def tableToText(soup, docid=None):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
-                                if re.search("单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人", head):
+                                if re.search("[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))", head):
                                     head = cell["left_head"] + head
                                 else:
                                     head += cell["left_head"]
@@ -1127,7 +1127,7 @@ def tableToText(soup, docid=None):
 
                                 cell = table_occurence[i][j]
                                 head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
-                                if re.search("单报标限总]价|金额|成交报?价|报价", head):
+                                if re.search("[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))", head):
                                     head = cell["top_head"] + head
                                 else:
                                     head += cell["top_head"]
@@ -2085,6 +2085,8 @@ def segment(soup,final=True):
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
+            if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
+                child.insert_before("。")
         # if child.name == 'div' and 'class' in child.attrs:
         #     # 添加附件"attachment"标识
         #     if "richTextFetch" in child['class']:
@@ -2822,79 +2824,6 @@ def del_achievement(text):
         text = text.replace(rs.group(0), '')
     return text
 
-def del_tabel_achievement(soup):
-    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
-        return None
-    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
-    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
-    for tag in soup.find_all('table'):
-        pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
-        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
-        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
-        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
-            _count = 0
-            for td in tag.find('tr').find_all('td'):
-                td_text = td.text.strip()
-                if len(td_text) > 25:
-                    break
-                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
-                    _count += 1
-                if _count >=2:
-                    pre_tag = tag.findPreviousSibling().extract()
-                    del_tag = tag.extract()
-                    # print('删除表格业绩内容', pre_tag.text + del_tag.text)
-                    break
-        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
-            del_tag = tag.extract()
-            # print('删除表格业绩内容', del_tag.text)
-    del_trs = []
-    '''删除表格某些行公布的业绩信息'''
-    for tag in soup.find_all('table'):
-        text = tag.text
-        if re.search('业绩', text) == None:
-            continue
-        # for tr in tag.find_all('tr'):
-        trs = tag.find_all('tr')
-        i = 0
-        while i < len(trs):
-            tr = trs[i]
-            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
-                td1_text =tr.td.text
-                td2_text =tr.td.findNextSibling().text
-                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
-                    # del_tag = tr.extract()
-                    # print('删除表格业绩内容', del_tag.text)
-                    del_trs.append(tr)
-            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
-                rows = tr.td.attrs.get('rowspan', '')
-                cols = tr.td.attrs.get('colspan', '')
-                if rows.isdigit() and int(rows)>2:
-                    for j in range(int(rows)):
-                        if i+j < len(trs):
-                            del_trs.append(trs[i+j])
-                    i += j
-                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
-                    next_tr_cols = 0
-                    td_num = 0
-                    for td in trs[i+1].find_all('td'):
-                        td_num += 1
-                        if td.attrs.get('colspan', '').isdigit():
-                            next_tr_cols += int(td.attrs.get('colspan', ''))
-                    if next_tr_cols == int(cols):
-                        del_trs.append(tr)
-                        for j in range(1,len(trs)-i):
-                            if len(trs[i+j].find_all('td')) == 1:
-                                break
-                            elif len(trs[i+j].find_all('td')) >= td_num-1:
-                                del_trs.append(trs[i+j])
-                            else:
-                                break
-                        i += j
-            i += 1
-        for tr in del_trs:
-            del_tag = tr.extract()
-            # print('删除表格业绩内容', del_tag.text)
-
 def split_header(soup):
     '''
     处理 空格分割多个表头的情况 : 主要标的名称      规格型号(或服务要求)      主要标的数量      主要标的单价      合同金额(万元)
@@ -2988,7 +2917,6 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = tableToText(article_processed)
         # print(article_processed)
         article_processed = segment(article_processed)
-        # print(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
@@ -3031,6 +2959,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         idx = article_processed.find('供应商报名、缴纳保证金、下载采购文件流程.docx。##attachment##。') # 修复404230599 E交易站源批量附件中标人错误
         if idx > 1000:
             article_processed = article_processed[:idx]
+        for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称,', article_processed): # 修复大纲类标点导致提取不到,例:515521734
+            article_processed = re.sub(it.group(0), it.group(0)[:-1]+':', article_processed)
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3195,6 +3125,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
             article.content = "".join(sentences)
             # sentences.append(article_processed[_begin:])
+            article.content = re.sub('[,。\s]+。', '。', article.content) # 处理连续标点
 
             lemmas = []
             doc_offsets = []

+ 39 - 13
BiddingKG/dl/interface/extract.py

@@ -80,6 +80,9 @@ def extractCount(extract_dict,page_attachments,web_source_name):
     dict_pack = _extract.get("prem",{})
     extract_count = 0
     list_code = _extract.get("code",[])
+    word_count = _extract.get("word_count",{})
+    if word_count.get("正文",0)>500:
+        extract_count += 3
     if len(list_code)>0:
         project_code = list_code[0]
     else:
@@ -102,10 +105,12 @@ def extractCount(extract_dict,page_attachments,web_source_name):
                 if _role[0]=="tenderee":
                     tenderee = _role[1]
                 if _role[0]=="win_tenderer":
+                    if _role[1] is not None and _role[1]!="":
+                        extract_count += 2
                     if  win_tenderer=="":
                         win_tenderer = _role[1]
                     if _role[2]!='' and float(_role[2])>0:
-                        extract_count += 1
+                        extract_count += 2
                         if win_bid_price=="":
                             win_bid_price = str(float(_role[2]))
                 if _role[0]=="agency":
@@ -118,15 +123,18 @@ def extractCount(extract_dict,page_attachments,web_source_name):
                 if _role.get("role_name")=="tenderee":
                     tenderee = _role["role_text"]
                 if _role.get("role_name")=="win_tenderer":
+                    if _role["role_text"] is not None and _role["role_text"]!="":
+                        extract_count += 2
                     if  win_tenderer=="":
                         win_tenderer = _role["role_text"]
                     if "role_money" in _role:
                         if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
-                            extract_count += 1
+                            extract_count += 2
                             if win_bid_price=="":
                                 win_bid_price = str(float(_role["role_money"]["money"]))
                 if _role["role_name"]=="agency":
                     agency = _role["role_text"]
+
                 linklist = _role.get("linklist",[])
                 for link in linklist:
                     for l in link:
@@ -152,10 +160,12 @@ def extractCount(extract_dict,page_attachments,web_source_name):
                         has_zhaobiao = True
                     if str(classification)=='采购清单':
                         has_qingdan = True
+
+                extract_count += 2
             if has_zhaobiao:
-                extract_count += 3
-            if has_qingdan:
                 extract_count += 2
+            if has_qingdan:
+                extract_count += 1
         except Exception as e:
             traceback.print_exc()
             pass
@@ -173,7 +183,7 @@ def extractCount(extract_dict,page_attachments,web_source_name):
             extract_count += 1
 
     if web_source_name in set_login_web:
-        extract_count -= 1
+        extract_count -= 3
 
     return extract_count
 
@@ -253,10 +263,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines= extract_parameters(parse_document, list_articles[0].content)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text = extract_parameters(parse_document, list_articles[0].content)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -345,14 +355,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     if original_docchannel != 302:  # 审批项目不做下面提取
         '''表格要素提取'''
-        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise, web_source_name)
+        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name)
         # print('表格提取中标人:', table_prem)
         # print('原提取角色:', prem[0]['prem'])
         if table_prem:
             getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem, in_attachment=in_attachment)
 
     '''候选人提取'''
-    candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    candidate_top3_prem, candidate_dic, in_attachment = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise+nlp_enterprise_attachment)
     # print('表格提取候选人:', candidate_top3_prem)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem, in_attachment=in_attachment)
 
@@ -382,7 +392,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
     fail = channel_dic['docchannel']['docchannel'] == "废标公告"
-    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
+    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
@@ -432,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-07-26'}
+    version_date = {'version_date': '2024-08-20'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -452,6 +462,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
             if len(data_res['prem']['Project']['roleList']) == 0 and data_res['prem']['Project'].get('tendereeMoney', 0) in [0, '0']: # 删除空包
                 data_res['prem'].pop('Project')
 
+    # 把产品属性里面的产品补充到产品列表
+    for d in data_res['product_attrs']['data']:
+        if isinstance(d['product'], str) and d['product'] not in data_res['product']:
+            data_res['product'].append(d['product'])
 
     '''最终检查修正招标、中标金额'''
     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
@@ -482,6 +496,18 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res['label_dic'] = label_dic
     # 开标地点
     data_res['addr_dic'] = {'addr_bidopen': addr_bidopen_text}
+    # 投标地址
+    data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text
+    # 字数
+    if '##attachment##' in list_articles[0].content:
+        text_main, text_attn = list_articles[0].content.split('##attachment##')
+    else:
+        text_main = list_articles[0].content
+        text_attn = ""
+    data_res['word_count'] = {'正文': len(text_main), '附件': len(text_attn)}
+    # 限制产品数量
+    data_res['product'] = data_res['product'][:500]
+    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
 
     # for _article in list_articles:
     #         log(_article.content)
@@ -496,7 +522,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     return _extract_json#, list_articles[0].content, get_ent_context(list_sentences, list_entitys)
 
 
-def test(name,content):
+def test1(name,content):
     user = {
         "content": content,
         "id":name
@@ -550,7 +576,7 @@ if __name__=="__main__":
     #     print(rs['product_attrs'])
     # print(rs)
 
-    with open('D:/html/2.html', 'r', encoding='utf-8') as f:
+    with open('2.html', 'r', encoding='utf-8') as f:
         text = f.read()
         t1 = time.time()
         print(predict('', text, title))

+ 171 - 80
BiddingKG/dl/interface/getAttributes.py

@@ -550,15 +550,15 @@ def getRoleList(list_sentence,list_entity,on_value = 0.5):
             elif str(label) in ["2"] and entity_prob > 0.8:
                 win_tenderer_set.add(entity_text)
 
-            if len(list_real_comba) > 1 and label == '2':
-                multi_winner = []
-                for comba in list_real_comba:
-                    tmp_ent = comba.get(_key, '')
-                    tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1]
-                    if tmp_ent !='' and tmp_prob>0.8:
-                        multi_winner.append(comba[_key])
-                if len(set(multi_winner)) > 1:
-                    RoleList[-1].multi_winner = multi_winner
+            # if len(list_real_comba) > 1 and label == '2':  # 20240809 由于包号对应不上注销
+            #     multi_winner = []
+            #     for comba in list_real_comba:
+            #         tmp_ent = comba.get(_key, '')
+            #         tmp_prob = dict_pack_entity_prob.get(_key+'$text$'+tmp_ent, ['',0])[1]
+            #         if tmp_ent !='' and tmp_prob>0.8:
+            #             multi_winner.append(comba[_key])
+            #     if len(set(multi_winner)) > 1:
+            #         RoleList[-1].multi_winner = multi_winner
             # print('RoleList: ', RoleList)
             RoleSet.add(entity_text)
 
@@ -944,7 +944,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
+                # packDict[packageName]["roleList"][i].serviceTime = serviceTime.entity_text
+                packDict[packageName]["roleList"][i].serviceTime = extract_serviceTime(serviceTime.entity_text,"")
 
     #根据实体名称得到角色
     def getRoleWithText(packDict,entity_text):
@@ -3767,9 +3768,9 @@ def get_days_between(day1,day2,get_abs=0):
         return days_difference
 
 def extract_serviceTime(service_time,page_time):
-    pattern1 = re.compile("\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
-    pattern2 = re.compile("\d+(?:\.\d+)?[\((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
-    pattern3 = re.compile("\d{4}[年\-\./]\d{1,2}月?")
+    pattern1 = re.compile("\d{4}[年\-./]\d{1,2}[月\-./]\d{1,2}日?")
+    pattern2 = re.compile("\d+(?:\.\d+)?[((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
+    pattern3 = re.compile("\d{4}[年\-./]\d{1,2}月?")
     pattern4 = re.compile("(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
     DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                  "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
@@ -3829,7 +3830,7 @@ def extract_serviceTime(service_time,page_time):
 
         return sum(result_list) + result
 
-    serviceTime_dict = {"service_start": "", "service_end": "", "service_days": ""}
+    serviceTime_dict = {"service_start": "", "service_end": "", "service_days": 0}
     re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+',service_time)
     for _num in re_num:
         if not re.search("[十拾百佰千仟]",_num):
@@ -3850,7 +3851,7 @@ def extract_serviceTime(service_time,page_time):
         time_list = []
         for _time in re.findall(pattern1,service_time):
             _time = re.sub("日","",_time)
-            _time = re.sub("[年月\./]","-",_time)
+            _time = re.sub("[年月./]","-",_time)
             _year,_month,_day = _time.split("-")
             _month = int(_month)
             _day = int(_day)
@@ -3866,7 +3867,7 @@ def extract_serviceTime(service_time,page_time):
             if get_days_between(page_time,time_list[1])>1 and get_days_between(time_list[0],time_list[1])>0:
                 serviceTime_dict['service_end'] = time_list[1]
                 serviceTime_dict['service_start'] = time_list[0]
-        else:
+        elif len(time_list)==1:
             if get_days_between(page_time, time_list[0]) > 1:
                 serviceTime_dict['service_end'] = time_list[0]
             # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
@@ -3875,7 +3876,7 @@ def extract_serviceTime(service_time,page_time):
         # end_time = re.findall(pattern3,service_time)[-1]
         for _time in re.findall(pattern3,service_time):
             _time = re.sub("月","",_time)
-            _time = re.sub("[年\./]","-",_time)
+            _time = re.sub("[年./]","-",_time)
             _year,_month = _time.split("-")
             _day = 0
             _month = int(_month)
@@ -3892,7 +3893,7 @@ def extract_serviceTime(service_time,page_time):
             if get_days_between(page_time, time_list[1]) > 1 and get_days_between(time_list[0], time_list[1]) > 0:
                 serviceTime_dict['service_end'] = time_list[1]
                 serviceTime_dict['service_start'] = time_list[0]
-        else:
+        elif len(time_list)==1:
             if get_days_between(page_time, time_list[0]) > 1:
                 serviceTime_dict['service_end'] = time_list[0]
                 # service_days = (time.mktime(time.strptime(end_time,"%Y-%m-%d"))-page_timestamp)/(24*60*60)
@@ -3921,24 +3922,33 @@ def extract_serviceTime(service_time,page_time):
                 elif unit==1:
                     if match_num>4000:#单位为'日'时,排除数字过大的
                         match_num = 0
-                service_days = match_num * unit
-                if int(service_days) % 360==0:
+                service_days = int(match_num * unit)
+                if service_days % 360==0:
                     service_days = service_days / 360 * 365
+                elif service_days % 180==0 and service_days % 360!=0:
+                    service_days = service_days // 360 * 365 + 180
                 service_days = int(service_days)
                 if service_days <= 1 and service_days > 4000:
                     service_days = 0
 
-                if service_days>0:
-                    service_days = str(service_days) + "天"
+                if service_days>3:
+                    # service_days = str(service_days) + "天"
                     serviceTime_dict['service_days'] = service_days
                     break
     elif "半年" in service_time:
         service_days = 180
-        service_days = str(service_days) + "天"
+        # service_days = str(service_days) + "天"
+        serviceTime_dict['service_days'] = service_days
+    if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
+        service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
         serviceTime_dict['service_days'] = service_days
 
     return serviceTime_dict
 
+def getServiceTime():
+    '''Empty placeholder: takes no arguments, does nothing and returns None.'''
+
+    pass
+
 def getOtherAttributes(list_entity,page_time,prem,channel_dic):
     dict_other = {"moneysource":"",
                   "person_review":[],
@@ -3976,7 +3986,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
 
     time_contractEnd = prem[0].get("time_contractEnd","")[:10]
     time_contractStart = prem[0].get("time_contractStart","")[:10]
-    serviceTime_dict = {"service_start":"", "service_end":"", "service_days": ""}
+    serviceTime_dict = {"service_start":"", "service_end":"", "service_days": 0}
     if time_contractEnd:
         serviceTime_dict['service_end'] = time_contractEnd
         if time_contractStart:
@@ -3986,12 +3996,10 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
     if list_serviceTime and not serviceTime_dict['service_end']:
         list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
         list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
-        # if not list_serviceTime:
-        #     list_serviceTime = list_serviceTime_inAtt
         error_serviceTime = []
         for list_time in [list_serviceTime,list_serviceTime_inAtt]:
             # if not dict_other["serviceTime"]:
-            if not serviceTime_dict['service_end']:
+            if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                 list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
@@ -4040,7 +4048,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                                 break
     if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
         service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
-        serviceTime_dict['service_days'] = str(service_days) + "天"
+        serviceTime_dict['service_days'] = service_days
     dict_other["serviceTime"] = serviceTime_dict
     if not time_contractEnd and channel_dic['docchannel']['docchannel']=='合同公告': # 用serviceTime补充合同开始结束时间,公告类型为合同公告
         if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
@@ -4189,7 +4197,10 @@ def limit_maximum_amount(dic, list_entity):
     for value in dic['prem'].values():
         for l in value['roleList']:
             if l["role_name"] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
-                date = float(re.search('(\d+)天', l.get('serviceTime', '')).group(1)) if re.search('(\d+)天', l.get('serviceTime', '')) else 0
+                # date = float(re.search('(\d+)天', l.get('serviceTime', '')).group(1)) if re.search('(\d+)天', l.get('serviceTime', '')) else 0
+                serviceTime_dict = l.get('serviceTime', dict())
+                serviceTime_dict = serviceTime_dict if serviceTime_dict else dict()
+                date = serviceTime_dict.get("service_days",0)
                 if 0 < date < 180 and float(l["role_money"]['money']) > 10000000000: # 工期小于180天且金额大于百亿的,错误
                     l["role_money"]['money'] = str(Decimal(l["role_money"]['money']) / 10000)
                     # print('工期纠正百亿以上金额 ')
@@ -4285,7 +4296,7 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     :return:
     '''
     try:
-        if 'win_tenderer' in str(prem) and re.search('联合(体|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]', list_articles[0].content):
+        if 'win_tenderer' in str(prem) and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|[,;]成:|(成[),]|与[^,。]{6,100}联合体', list_articles[0].content):
             sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
             for project in prem[0].values():
                 if not isinstance(project, dict):
@@ -4303,20 +4314,28 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                     if _entity.entity_type in ['org', 'company'] and _entity.label==2\
                                             and _entity.entity_text==winner:
                                         s = sentences[_entity.sentence_index].sentence_text
+                                        find_joint = 0 # 是否包含联合体
                                         for j in range(i+1, len(list_entity)):
                                             behind_entity = list_entity[j]
                                             b2 = behind_entity.wordOffset_begin
                                             e2 = behind_entity.wordOffset_end
                                             if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
-                                                    and b2-e<10 and re.search('联合(体|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[b2-e:b2]) or \
-                                                re.search('(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
+                                                    and b2-e<13 and re.search('联合(体|方|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[e:b2]) or \
+                                                re.search('(联合(体|方|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
+                                                join_l.append(behind_entity.entity_text)
+                                                b = b2
+                                                e = e2
+                                                find_joint = 1
+                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
                                                 join_l.append(behind_entity.entity_text)
                                                 b = b2
                                                 e = e2
+                                            elif e == e2: # 修复重复实体导致中断情况
+                                                continue
                                             else:
                                                 break
                                         if len(join_l)>1:
-                                            d['win_tenderer_joint'] = ''.join(set(join_l))
+                                            d['win_tenderer_joint'] = ','.join(set(join_l))
 
 
 
@@ -4348,17 +4367,70 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
     :param list_sentences:
     :return:
     '''
+
+    def add_multi_winner(pack_l, winner_l):
+        if len(prem[0]) > 1 and len(set([it[0] for it in pack_l])) > 1:  # 多标段多中标人处理
+            pk_dic = {}
+            for ent in winner_l:
+                for i in range(len(pack_l)):
+                    pk, s1, b1, _ = pack_l[i]
+                    if ent[1] < s1 or ent[1] == s1 and ent[2] < b1:
+                        break
+                    elif (ent[1] > s1 or ent[1] == s1 and ent[2] > b1):
+                        if i < len(pack_l) - 1:
+                            pk2, s2, b2, _ = pack_l[i + 1]
+                            if (ent[1] < s2 or ent[1] == s2 and ent[2] < b2):
+                                if pk not in pk_dic:
+                                    pk_dic[pk] = set()
+                                pk_dic[pk].add(ent[0])
+                            else:
+                                continue
+                        else:
+                            if pk not in pk_dic:
+                                pk_dic[pk] = set()
+                            pk_dic[pk].add(ent[0])
+                    else:
+                        continue
+            for pk, multi_winner in pk_dic.items():
+                multi_winner = multi_winner - tenderee_or_agency
+                if len(multi_winner) < 2:
+                    continue
+                for project in prem[0].values():
+                    if not isinstance(project, dict):
+                        continue
+                    for k, v in project.items():
+                        if pk == k:
+                            for d in v['roleList']:
+                                if d.get('role_name', '') == 'win_tenderer':
+                                    if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
+                                        d['multi_winner'] = ','.join(set(multi_winner))
+        else:
+            multi_winner = set([it[0] for it in winner_l]) - tenderee_or_agency
+            if len(multi_winner) > 1:
+                for project in prem[0].values():
+                    if not isinstance(project, dict):
+                        continue
+                    for v in project.values():
+                        for d in v['roleList']:
+                            if d.get('role_name', '') == 'win_tenderer':
+                                if d.get('role_text', '') in multi_winner and 'multi_winner' not in d:
+                                    d['multi_winner'] = ','.join(set(multi_winner))
+                                break
+
     moneys = []
     moneys_attachment = []
     if channel_dic['docchannel']['docchannel'] in ['中标信息','候选人公示','合同公告'] and 'win_tenderer' in str(prem):
         sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
+        entitys = sorted(list_entitys[0], key=lambda x: x.sentence_index)
         finalists = [] # 入围供应商
+        multi_winner_l = [] # 保存中标人名称列表
+        tenderee_or_agency = set()
+        package_l = []
         i = 0
-        while i < len(list_entitys[0])-1:
-            ent = list_entitys[0][i]
+        while i < len(entitys)-1:
+            ent = entitys[i]
             b_idx_fr = ent.wordOffset_begin
             e_idx_fr = ent.wordOffset_end
-            multi_winner_l = []
             i += 1
             if ent.entity_type in ['money']:
                 money = float(ent.entity_text)
@@ -4366,53 +4438,57 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys_attachment.append(money)
                 else:
                     moneys.append(money)
-            if ent.entity_type in ['org', 'company'] and ent.label == 2 and ent.values[ent.label]>0.8:
-                if ent.entity_text not in multi_winner_l:
-                    multi_winner_l.append(ent.entity_text)
+            elif ent.entity_type in ['package']:
+                package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+            elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
+                tenderee_or_agency.add(ent.entity_text)
+            elif ent.entity_type in ['org', 'company'] and ent.label == 2:
                 sentence_text = sentences[ent.sentence_index].sentence_text
-                pre_text = sentence_text[max(0, b_idx_fr-10):b_idx_fr]
-                if re.search('入围', pre_text) and re.search('未入围', pre_text)==None and ent.entity_text not in finalists:
-                    finalists.append(ent.entity_text)
-                for j in range(i, len(list_entitys[0])):
-                    ent_bh = list_entitys[0][j]
-                    b_idx_bh = ent_bh.wordOffset_begin
-                    e_idx_bh = ent_bh.wordOffset_end
-                    if ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh-e_idx_fr==1:
-                        sentence_text = sentences[ent_bh.sentence_index].sentence_text
-                        if sentence_text[e_idx_fr:b_idx_bh] in [';','、','&',','] and (len(sentence_text)==e_idx_bh or sentence_text[e_idx_bh] in [';','、','&', ',', '。']): # 修复多中标人刚好在文末index超出报错,例子 407126558
-                            if ent_bh.entity_text not in multi_winner_l:
-                                multi_winner_l.append(ent_bh.entity_text)
+                pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
+                if ent.values[ent.label] > 0.8:
+                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                    for j in range(i, len(list_entitys[0])):
+                        ent_bh = list_entitys[0][j]
+                        b_idx_bh = ent_bh.wordOffset_begin
+                        e_idx_bh = ent_bh.wordOffset_end
+                        if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
+                            sentence_text = sentences[ent_bh.sentence_index].sentence_text
+                            if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
+                                    len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']):  # 修复多中标人刚好在文末index超出报错,例子 407126558
+                                multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
+                                e_idx_fr = e_idx_bh
+                                i = j + 1
+                            else:
+                                break
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
+                            multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                             e_idx_fr = e_idx_bh
                             i = j + 1
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # 处理 514603520 中国邮政储蓄银行股份有限公司淄博市临淄区支行 实体由于字典匹配重复两次情况
+                            i = j + 1
                         else:
                             break
-                    elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh==e_idx_fr:
-                        if ent_bh.entity_text not in multi_winner_l:
-                            multi_winner_l.append(ent_bh.entity_text)
-                        e_idx_fr = e_idx_bh
-                        i = j + 1
-                    else:
-                        break
-            if len(multi_winner_l)>=2:
-                for project in prem[0].values():
-                    if not isinstance(project, dict):
-                        continue
-                    for v in project.values():
-                        for d in v['roleList']:
-                            if d.get('role_name', '') == 'win_tenderer' and d.get('role_text', '') == multi_winner_l[0]:
-                                # d['multi_winner'] = ','.join(set(multi_winner_l))
-                                d['multi_winner'] = ','.join(multi_winner_l)
-                                break
-        if len(finalists)>=2:
-            for project in prem[0].values():
-                if not isinstance(project, dict):
-                    continue
-                for v in project.values():
-                    for d in v['roleList']:
-                        if d.get('role_name', '') == 'win_tenderer':
-                            winner = d.get('role_text')
-                            if winner in finalists:
-                                d['multi_winner'] = ','.join(finalists)
+                    if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
+                        finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+
+        if len(multi_winner_l)>=2:
+            winner_main = [it for it in multi_winner_l if not it[3]]
+            winner_attn = [it for it in multi_winner_l if it[3]]
+            pack_main = [it for it in package_l if not it[3]]
+            pack_attn = [it for it in package_l if it[3]]
+            if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:441612746
+                add_multi_winner(pack_main, winner_main)
+            elif len(set([it[0] for it in winner_attn]))>=2:
+                add_multi_winner(pack_attn, winner_attn)
+        if len(finalists)>=2: # 多入围候选人
+            winner_main = [it for it in finalists if not it[3]]
+            winner_attn = [it for it in finalists if it[3]]
+            pack_main = [it for it in package_l if not it[3]]
+            pack_attn = [it for it in package_l if it[3]]
+            if len(set([it[0] for it in winner_main]))>=2: # 有两个及以上多中标人及多标段 例:276326152
+                add_multi_winner(pack_main, winner_main)
+            elif len(set([it[0] for it in winner_attn]))>=2:
+                add_multi_winner(pack_attn, winner_attn)
     else:
         for i in range(len(list_entitys[0])):
             ent = list_entitys[0][i]
@@ -4433,7 +4509,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
     '''
     if len(new_prem) >= 1 :
         '''如果表格提取的包大于2,原来的包比表格提取的包多则删除原来多余的包,以表格的为准'''
-        if len(new_prem) >= 2 and len(new_prem)<len(old_prem) <= len(new_prem)*2:
+        if len(new_prem) >= 2 and (len(new_prem)<len(old_prem) <= len(new_prem)*2 or set(old_prem)&set(new_prem)==set()): # 修复类似443925411 标的+标包才算标段号
             del_k = []
             for k in old_prem:
                 if k not in new_prem and k != 'Project':
@@ -4441,7 +4517,7 @@ def update_prem(old_prem, new_prem, in_attachment=False):
             for k in del_k:
                 old_prem.pop(k)
 
-        if len(old_prem) > len(new_prem) and in_attachment==False: # 如果表格有提取,非表格包数比表格提取多,去掉非表格在附件里提取的包
+        if len(old_prem) > len(new_prem) and len(new_prem)>1 and in_attachment==False: # 如果表格有提取,非表格包数比表格提取多,去掉非表格在附件里提取的包
             del_k = []
             for k in old_prem:
                 if 'in_attachment' in old_prem[k] and old_prem[k]['in_attachment'] and k not in new_prem and k != 'Project':
@@ -4479,6 +4555,10 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 if float(d2['role_money']['money']) != 0:  # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
+                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
+                                    if d2[k]:
+                                        d[k] = d2[k]
+
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem['Project']['roleList'].append(d2)
@@ -4508,6 +4588,9 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
+                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
+                                    if d2[k]:
+                                        d[k] = d2[k]
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
@@ -4536,8 +4619,16 @@ def  confirm_prem(prem, channel_dic):
                 if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']:
                     if k == 'Project':
                         pro_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            pro_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            pro_winner.update(set(d['multi_winner'].split(',')))
                     else:
                         other_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            other_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            other_winner.update(set(d['multi_winner'].split(',')))
         if pro_winner & other_winner != set():
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',

+ 66 - 51
BiddingKG/dl/interface/get_label_dic.py

@@ -6,14 +6,8 @@
 @time: 2024/7/23 14:45
 """
 
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-"""
-@author: bidikeji
-@time: 2024/7/11 17:56
-"""
 from BiddingKG.dl.common.Utils import getUnifyMoney
+import math
 import re
 
 def chinese_to_arabic(s):
@@ -59,34 +53,37 @@ def chinese_to_arabic(s):
 def get_all_label(title, content):
     def is_direct_procurement():
         # 企业直采
-        if re.search('询比价|询比|竞价|竞价|议价|报价', title) or re.search('我要报价|竞价起止时间|报价起止时间', content) or \
+        if re.search('询比价|询比|竞价|议价|报价', title) or re.search('我要报价|竞价起止时间|报价起止时间', content) or \
                 (re.search('公司|集团|企业', content) and re.search('招标|中标|投标', content) == None):
             return 1
         return 0
 
     def is_target_small():
         # 专门面向中小企业
-        if re.search('专门面向中小微?企业', content) and re.search('(非|不属于|不|是/否))?专门面向中小微?企业|部分面向中小微?企业', content) == None:
+        if re.search('专门面向中小微?企业', content):
+            if re.search('(非|不属于|不|是/否))?专门面向(中小微?企业)?|部分面向中小微?企业|专门面向中小企业采购:否', content):
+                return 0
             return 1
         elif re.search('仅面向小微企业|专门面向.{,30}中小企业采购|是否专门面向中小微?企业(采购)?:是|本项目为中小型企业预留项目|专门面向中小微?企业', content):
             return 1
-        elif re.search('落实政府采购政策需满足的资格要求.{,30}供应商为中小企业', content) and re.search('(非|不属于|不|是/否))?专门面向中小微?企业|部分面向中小微?企业',
-                                                                                content) == None:
+        elif re.search('落实政府采购政策需满足的资格要求.{,30}供应商为中小企业', content):
+            if re.search('(非|不属于|不|是/否))?专门面向中小微?企业|部分面向中小微?企业',content):
+                return 0
             return 1
         return 0
 
     def registered_years():
         # 注册年限
         ser = None
-        if re.search('禁止\w{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
-            ser = re.search('禁止\w{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content)
-        elif re.search('(成立|注册)时间:?\w{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
-            ser = re.search('(成立|注册)时间:?\w{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))',
+        if re.search('禁止.{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
+            ser = re.search('禁止.{,5}注册未满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content)
+        elif re.search('(成立|注册)时间:?.{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))', content):
+            ser = re.search('(成立|注册)时间:?.{,10}(不[低少]于|大于(等于)?|需满)(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月))',
                             content)
-        elif re.search('(成立|注册)时间:?\w{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content):
-            ser = re.search('(成立|注册)时间:?\w{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content)
-        elif re.search('(成立|注册)时间:?\w{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)\w{,5}请勿报价)', content):
-            ser = re.search('(成立|注册)时间:?\w{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)\w{,5}请勿报价)', content)
+        elif re.search('(成立|注册)时间:?.{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content):
+            ser = re.search('(成立|注册)时间:?.{,10}(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月)[或及]?以上)', content)
+        elif re.search('(成立|注册)时间:?.{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月).{,5}请勿报价)', content):
+            ser = re.search('(成立|注册)时间:?.{,10}不满(?P<num>([一二三四五六七八九十]+|\d+))(?P<unit>(年|个?月).{,5}请勿报价)', content)
         if ser:
             num = ser.group('num')
             unit = ser.group('unit')
@@ -94,18 +91,18 @@ def get_all_label(title, content):
                 num = int(num)
             else:
                 num = chinese_to_arabic(num)
-            if unit == '年':
-                num *= 12
+            if '月' in unit: # 向上取整为年
+                num = math.ceil(num/12)
             return num
         return 0
 
     def registered_capital():
         # 注册资本
         ser = None
-        if re.search('注册(资本|资金):?\w{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content):
-            ser = re.search('注册(资本|资金):?\w{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content)
-        elif re.search('注册(资本|资金):?\w{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content):
-            ser = re.search('注册(资本|资金):?\w{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content)
+        if re.search('注册(资本|资金):?.{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content):
+            ser = re.search('注册(资本|资金):?.{,5}(不[低少]于|大于(等于)?|≥)(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元))', content)
+        elif re.search('注册(资本|资金):?.{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content):
+            ser = re.search('注册(资本|资金):?.{,5}(?P<num>(\d+[\d.]*))(?P<unit>([万亿]?元)[或及]?以上)', content)
         if ser:
             num = ser.group('num')
             unit = ser.group('unit')
@@ -116,35 +113,43 @@ def get_all_label(title, content):
         # 有资质证书要求
         if re.search('资质要求.{,150}(行业资质|证书|许可证|认证|经营范围|一级|二级|三级|甲级|乙级|丙级|特级|壹级|贰级|叁级)', content):
             return 1
-        elif re.search('(提供|有|具备)\w{,50}(资质|认证|证书|许可证)', content):
+        elif re.search('(提供|有|具备).{,50}(资质|认证|证书|许可证)', content):
             return 1
-        elif re.search('资格)?要求:?\w{,30}(甲级|丙级|乙级|一级|二级|三级|特级|壹级|贰级|叁级)', content):
+        elif re.search('资格)?要求:?.{,50}(甲级|丙级|乙级|一级|二级|三级|特级|壹级|贰级|叁级)', content):
             return 1
         elif re.search('认证体系要求', content):
             return 1
+        elif re.search('经营范围须包含|经营范围需包含|经营范围需有|经营范围须有|经营范围有|经营范围内含|营业执照范围内包括', content):
+            return 1
         return 0
 
     def need_ca():
         # 7 是否需要办CA
-        if re.search('需要\w{,20}数字证书|使用\w{,20}签章', content):
+        if re.search('需要.{,20}数字证书|使用.{,20}(签章|数字证书|CA|ca)', content):
             return 1
-        elif re.search('办理\w{,20}(数字证书|CA|ca)', content) and re.search('无需\w{,15}办理', content) == None:
+        elif re.search('办理.{,20}(数字证书|CA|ca|密钥|签章)', content):
+            if re.search('无需.{,15}办理', content):
+                return 0
             return 1
-        elif re.search('(数字证书|CA|ca)\w{,5}办理|是否要求供应商使用(CA|ca)数字证书参与:是', content):
+        elif re.search('(数字证书|CA|ca|密钥).{,5}办理|是否要求供应商使用(CA|ca)数字证书参与:是', content):
             return 1
-        if re.search('(不使用|无需)\w{,20}(数字证书|CA|ca)|是否要求供应商使用(CA|ca)数字证书参与:不要求', content):
+        if re.search('(不使用|无需).{,20}(数字证书|CA|ca)|是否要求供应商使用(CA|ca)数字证书参与:不要求', content):
             return 0
         return 0
 
     def need_performance():
         # 有业绩要求
-        if re.search('业绩证明|业绩要求|行业业绩|相关业绩', content):
+        if re.search('业绩证明|业绩要求|行业业绩|相关业绩|同类项目业绩经验|业绩材料', content):
+            if re.search('业绩.{,5}[/无]', content):
+                return 0
             return 1
-        elif re.search('类似\w{,10}业绩', content) or re.search('业绩.{,5}如有', content) == None:
+        elif re.search('类似.{,10}业绩', content):
+            if re.search('业绩.{,5}如有', content):
+                return 0
             return 1
-        elif re.search('完成[^,。]{,100}项目', content):
+        elif re.search('完成[^。]{,100}项目', content):
             return 1
-        elif re.search('(提供|有|完成).{,100}业绩', content):
+        elif re.search('(提供|有|完成|承接|具备|承担)[^。;]{,100}业绩', content):
             return 1
         return 0
 
@@ -204,15 +209,15 @@ def get_all_label(title, content):
                 return 2
         return 0
 
-    def suitable_small():
-        # 适合小微企业投标
-        if re.search('属于专门面向中小企业|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
-            return 1
-        elif re.search('属于企业直采|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
-            return 2
-        elif re.search('有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
-            return 3
-        return 0
+    # def suitable_small():
+    #     # 适合小微企业投标
+    #     if re.search('属于专门面向中小企业|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
+    #         return 1
+    #     elif re.search('属于企业直采|有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
+    #         return 2
+    #     elif re.search('有招标单位联系方式|无注册年限要求|无注册资本要求|无资质证书要求|无业绩要求', content):
+    #         return 3
+    #     return 0
 
     label_dic = {}
     is_direct_procurement = is_direct_procurement() # 是否直接采购
@@ -223,7 +228,7 @@ def get_all_label(title, content):
     need_qualification = need_qualification() # 资质要求
     registered_capital = registered_capital() # 注册资本
     registered_years = registered_years() # 注册年限
-    suitable_small = suitable_small() # 适合小微企业
+    # suitable_small = suitable_small() # 适合小微企业
 
     label_dic['is_direct_procurement'] = is_direct_procurement
     label_dic['is_target_small'] = is_target_small
@@ -233,7 +238,7 @@ def get_all_label(title, content):
     label_dic['need_qualification'] = need_qualification
     label_dic['registered_capital'] = registered_capital
     label_dic['registered_years'] = registered_years
-    label_dic['suitable_small'] = suitable_small
+    # label_dic['suitable_small'] = suitable_small
 
     label_dic = {k: v for k, v in label_dic.items() if v!=0}
 
@@ -249,19 +254,29 @@ if __name__ == "__main__":
     from bs4 import BeautifulSoup
     import json
 
-    df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')[:]
+    df = pd.read_csv(r'E:\导出数据\2024-03-18入库去重后所有公告_输入要素.csv')[:]
+    df1 = pd.read_csv(r'E:\导出数据\2024-03-18入库去重后所有公告_html.csv')[:]
+    df = df.merge(df1, how='inner', on='docid')
+    df.fillna('', inplace=True)
+    # df = df[:10]
+
     print(df.columns, len(df))
-    df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
-    print(len(df))
+    # df.drop_duplicates(subset=['docchannel', 'web_source_name', 'exist_table'], inplace=True)
+    # print(len(df))
     def get_text(html):
         soup = BeautifulSoup(html, 'lxml')
         text = soup.get_text()
         return text
     df['content'] = df['dochtmlcon'].apply(lambda x: get_text(x))
     df['标签'] = df.apply(lambda x: get_all_label(x['doctitle'], x['content']), axis=1)
+
+    for k in ['is_direct_procurement', 'is_target_small', 'mode_of_partipation', 'need_ca', 'need_performance', 'need_qualification', 'registered_capital', 'registered_years']:
+        df[k] = df['标签'].apply(lambda x: x[k])
+
     df['标签'] = df['标签'].apply(lambda x: json.dumps(x, ensure_ascii=False, indent=2))
-    df = df[['docid', 'docchannel', 'web_source_name', 'exist_table', '标签']]
-    df.to_excel('E:/公告标签提取结果.xlsx', index=False)
+    df.drop_duplicates(subset=['docchannel', 'web_source_no', 'is_direct_procurement', 'is_target_small', 'mode_of_partipation', 'need_ca', 'need_performance', 'need_qualification', 'registered_capital', 'registered_years'], inplace=True)
+    df = df[['docid', 'docchannel', 'web_source_no', '标签', 'is_direct_procurement', 'is_target_small', 'mode_of_partipation', 'need_ca', 'need_performance', 'need_qualification', 'registered_capital', 'registered_years']]
+    df.to_excel('E:/2024-03-18公告标签提取结果.xlsx', index=False)
 
 
 

BIN=BIN
BiddingKG/dl/interface/header_set.pkl


+ 7 - 3
BiddingKG/dl/interface/htmlparser.py

@@ -216,8 +216,8 @@ class ParseDocument():
                 self.soup = _body
             self.list_obj = self.get_soup_objs(self.soup)
 
-            # self.list_obj = [it.get_text().strip().replace(' ', '') for it in self.list_obj]
-            # self.list_obj = [Sentence2(text, 1,1,5) for text in self.list_obj]
+            self.list_obj = [re.sub('\s+', ' ', it.get_text().strip()) for it in self.list_obj]
+            self.list_obj = [Sentence2(text, 1,1,5) for text in self.list_obj]
 
         # for obj in self.list_obj:
         #     print("obj",obj.get_text()[:20])
@@ -286,7 +286,11 @@ class ParseDocument():
         groups = []
         if _se is not None:
             e = _se.end()
-            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
+            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量|评委)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天个分秒台条A-Za-z]|^(小时)', _text[e:])):
+                return None
+            elif re.match('[二三四五六七八九十]\w{1,2}[市区县]|五金|四川|八疆|九龙|[一二三四五六七八九十][层天标包]', _text) and re.match('[一二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
+                return None
+            elif re.search('^[\u4e00-\u9fa5]+[::]', _text[:e]):
                 return None
             _gd = _se.groupdict()
             for k,v in _gd.items():

+ 21 - 8
BiddingKG/dl/interface/outline_extractor.py

@@ -27,7 +27,7 @@ def extract_sentence_list(sentence_list):
         sentence_text = sentence.sentence_text
         begin_index = 0
         end_index = 0
-        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
+        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d\.、,。a-zA-Z]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
             temp = it.group(0)
             sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
         for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔
@@ -35,6 +35,8 @@ def extract_sentence_list(sentence_list):
             # if end_index!=len(sentence_text):
             #     # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误
             #     #     continue
+            if end_index != len(sentence_text) and re.match('[一二三四五六七八九十\d.]{1,2}[、,.]+$', sentence_text[begin_index:end_index]): # 避免表格序号和内容在不同表格情况 例:293178161
+                continue
             new_sentence_text = sentence_text[begin_index:end_index]
             sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
             if sentence.in_attachment:
@@ -53,10 +55,12 @@ def extract_sentence_list(sentence_list):
 
     return new_sentence2_list, new_sentence2_list_attach
 
-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程)(的?主要)?(内容|概况|范围|信息)([及与和](其它|\w{,2})要求)?" \
-                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)([::,]|$)"
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
 aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
+addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 out_lines = []
 
 def extract_parameters(parse_document, content):
@@ -70,6 +74,7 @@ def extract_parameters(parse_document, content):
     requirement_text = ''
     aptitude_text = ''
     addr_bidopen_text = ''
+    addr_bidsend_text = ''
+    # Collect outlines in a per-call list: appending to the module-level
+    # `out_lines` would keep growing across documents and leak outlines of
+    # earlier documents into the value returned for this one.
+    out_lines = []
 
     _find_count = 0
     _data_i = -1
@@ -81,12 +86,10 @@ def extract_parameters(parse_document, content):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-
-                outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
-                                 re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
+                if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', _text[:10]):
+                    out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
-                    out_lines.append(outline)
                     childs = get_childs([_data])
                     for c in childs:
                         # requirement_text += c["text"]+"\n"
@@ -146,6 +149,12 @@ def extract_parameters(parse_document, content):
                         addr_bidopen_text += c["text"]
                     _data_i += len(childs)
                     _data_i -= 1
+                elif re.search(addr_bidsend_pattern, _text[:20]):
+                    childs = get_childs([_data], max_depth=1)
+                    for c in childs:
+                        addr_bidsend_text += c["text"]
+                    _data_i += len(childs)
+                    _data_i -= 1
     if re.search('时间:', addr_bidopen_text) and re.search('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
         for ser in re.finditer('([开评]标|开启|评选|比选|递交\w{,4}文件)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidopen_text):
             b, e = ser.span()
@@ -156,7 +165,11 @@ def extract_parameters(parse_document, content):
         ser = re.search('([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件))?(会议)?地[点址]([((]网址[))])?[:为][^,;。]{2,100}[,;。]', content)
         if ser:
             addr_bidopen_text = ser.group(0)
-    return requirement_text, aptitude_text, addr_bidopen_text
+    if re.search('时间:', addr_bidsend_text) and re.search('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
+        for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
+            b, e = ser.span()
+        addr_bidsend_text = addr_bidsend_text[b:e]
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

+ 170 - 78
BiddingKG/dl/interface/predictor.py

@@ -29,6 +29,7 @@ import datetime
 from BiddingKG.dl.entityLink.entityLink import get_business_data
 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
 from BiddingKG.dl.interface.getAttributes import turnMoneySource
+from BiddingKG.dl.common.Utils import del_tabel_achievement
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
 cpu_num = int(os.environ.get("CPU_NUM",0))
@@ -435,6 +436,8 @@ class CodeNamePredict():
                                                     item['code'].append((it, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((it, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((it, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((it, 3, sentence.sentence_index))
                                         elif len(item['code']) > 0:
@@ -448,6 +451,8 @@ class CodeNamePredict():
                                                     item['code'][-1] = (new_it, 1, sentence.sentence_index)
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'][-1] = (new_it, 2, sentence.sentence_index)
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    # Overwrite the partial code with the merged one, like the sibling
+                                                    # branches; appending would keep the truncated code in the list too.
+                                                    item['code'][-1] = (new_it, 2.5, sentence.sentence_index)
                                                 else:
                                                     item['code'][-1] = (new_it, 3, sentence.sentence_index)
                                         else:
@@ -460,10 +465,14 @@ class CodeNamePredict():
                                                     item['code'].append((the_code, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((the_code, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((the_code, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((the_code, 3, sentence.sentence_index))
                                             break
                                 elif the_code not in code_set:
+                                    if len(the_code)<5: # 避免510545935 这种把 招标项目编号:2024年第二期 只提取2024
+                                        continue
                                     code_set.add(the_code)
                                     # item['code'].append(the_code)
                                     if re.search("(项目编号|招标编号):?$", pre_text[h]):
@@ -472,6 +481,8 @@ class CodeNamePredict():
                                         item['code'].append((the_code, 1, sentence.sentence_index))
                                     elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                         item['code'].append((the_code, 2, sentence.sentence_index))
+                                    elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                        item['code'].append((the_code, 2.5, sentence.sentence_index))
                                     else:
                                         item['code'].append((the_code, 3, sentence.sentence_index))
 
@@ -569,7 +580,7 @@ class CodeNamePredict():
                     # if othercode != None:
                     #     item[1]['code'].append(othercode.group(2))
                     # 2020/11/23 大网站规则调整
-                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
+                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{6,30}[a-zA-Z0-9\号期])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
                     if othercode != None:
                         # item['code'].append(othercode.group('code'))
                         if re.search("(项目编号|招标编号):?$", othercode.group(0)):
@@ -578,6 +589,8 @@ class CodeNamePredict():
                             item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
                         elif re.search('(询价|合同)编号:?$', othercode.group(0)):
                             item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
+                        elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
                         else:
                             item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
                         # print('规则召回项目编号:', othercode.group('code'))
@@ -838,9 +851,9 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:$', front):
                     label = 0
                     values[label] = 0.501
-                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front):  #修复第4以上的预测错为中标人
+                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
                     label = 5
-                    values[label] = 0.5
+                    values[2] = 0.5
                 elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
@@ -1400,7 +1413,7 @@ class RoleRulePredictor():
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
                                      "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_right = "(?P<tenderee_right>^(机关)?([((](以下简称)?[,\"“]*((招标|采购)(人|单位|机构)|(服务)?购买方)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
@@ -1462,9 +1475,9 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|采购成本价|总费用约?为")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[:]+\w{2,4}资金|采购成本价|总费用约?为")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
-        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
+        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
         # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
@@ -2569,7 +2582,7 @@ class ProductPredictor():
             paths.append(path[1:])
         return paths
 
-    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
+    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
         '''
         预测实体代码,每个句子最多取MAX_AREA个字,超过截断
         :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
@@ -2577,6 +2590,19 @@ class ProductPredictor():
         :param MAX_AREA: 每个句子最多截取多少字
         :return: 把预测出来的实体放进实体类
         '''
+        p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
+        sentence_range = []
+        if len(out_lines) >= 3: # 三个以上大纲
+            for i in range(len(out_lines)-1):
+                text, s1, b1 = out_lines[i]
+                _, s2, b2 = out_lines[i+1]
+                if 3<text.find(':')<20:
+                    text = text.split(':')[0]
+                if re.search(p, text[:15]):
+                    sentence_range.append((s1, s2))
+
         with self.sess.as_default() as sess:
             with self.sess.graph.as_default():
                 result = []
@@ -2643,6 +2669,25 @@ class ProductPredictor():
                     if len(list_sentence)==0:
                         result.append({"product":[]})
                         continue
+
+                    if sentence_range:
+                        # 2024-08-15: when requirement outlines were found, extract products only
+                        # from the first two sentences and from sentences inside those outlines,
+                        # to avoid picking up unrelated content such as bank statements
+                        # (example document 514920213).
+                        new_list = []
+                        word_num = 0  # characters collected from inside the outline ranges
+                        for sentence in list_sentence:
+                            if sentence.sentence_index<2:
+                                new_list.append(sentence)
+                                continue
+                            # Test every range: breaking out as soon as a sentence lies past one
+                            # range would ignore all later ranges. `any` also appends the sentence
+                            # only once when two adjacent ranges share a boundary sentence.
+                            if any(s1 <= sentence.sentence_index <= s2 for s1, s2 in sentence_range):
+                                new_list.append(sentence)
+                                word_num += len(sentence.sentence_text)
+                        # Only switch to the filtered list when the outlines hold enough text.
+                        if word_num > 100:
+                            list_sentence = new_list
+
                     list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                     _begin_index = 0
                     item = {"product":[]}
@@ -3970,7 +4015,7 @@ class DocChannel():
           '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
           '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
           '合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
-          '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示|评审结果',
+          '候选人公示': '候选人(变更)?公示|评标(结果)?公示|评审结果', #中标前公示|中标预公示|
           '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
@@ -6352,11 +6397,12 @@ class TablePremExtractor(object):
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码|代码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
-            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
+            "win_sort": "排名|排序|名次|推荐顺序",
+            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
+            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -6372,12 +6418,13 @@ class TablePremExtractor(object):
         contain_header = False
         if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
             flag = True
+            need_replace = 0 # 是否需要替换表头名称
             for i in range(len(td_list)) :
                 text = td_list[i]
                 text = re.sub('\s', '', text)
                 if text == '备选中标人':
                     text = '第二候选人'
-                if len(text) > 15: # 长度大于15 不进行表头匹配
+                if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
                     continue
                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
                     return flag, contain_header, dict()
@@ -6420,27 +6467,18 @@ class TablePremExtractor(object):
                      'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
                 return flag, contain_header, header_dic
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
-                if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
+                if 'win_sort' in header_dic: # 有排名的 用候选人提取类
+                    return flag, contain_header, dict()
+                elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
                     # print('只有供应商名称 没排名和包号的去掉')
                     return flag, contain_header, dict()
                 return flag,contain_header, header_dic
+            elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
+                return flag, contain_header, header_dic
         elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
         return flag, contain_header, dict()
 
-    def is_role(self, text):
-        if len(text) > 25 or len(text)<4:
-            return False
-        elif len(re.findall('有限责?任?公司', text)) > 1:
-            return False
-        elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
-            return True
-        else:
-            ners = selffool.ner(text)
-            if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
-                return True
-        return False
-
     def get_role(self, text, nlp_enterprise):
         '''
         获取字符串text角色实体
@@ -6451,7 +6489,7 @@ class TablePremExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
-        text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('[一二三四五六七八九十]+标段[:]|标段[一二三四五六七八九十]+[:]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
         text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
@@ -6484,7 +6522,9 @@ class TablePremExtractor(object):
             or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
-
+        have_bid_amount = False # 是否包含中标金额
+        if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
+            have_bid_amount = True
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
@@ -6495,30 +6535,31 @@ class TablePremExtractor(object):
             budget_ = df.loc[i, headers['budget'][0]].strip() if "budget" in headers else ""
             bid_amount_ = df.loc[i, headers['bid_amount'][0]].strip() if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]].strip() if "win_sort" in headers else ""
+            win_or_not = df.loc[i, headers['win_or_not'][0]].strip() if "win_or_not" in headers else ""
 
             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_]) & self.headerset != set(): # 只要有一项为表头 停止匹配
                 # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
                 break
-            if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # 内容为空或全部一样 停止匹配
+            if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2 and tenderer=='':  # 内容为空或全部一样 停止匹配
                 # print('内容为空或全部一样 停止匹配')
                 break
             if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
                 project_name = ""
-            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
+            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}$', project_name):
                 package_code_raw = project_name
                 project_name = ""
 
             package_code = package_code_raw
             if re.search('合计|总计', package_code+project_code):
                 continue
-            if package_code != '' and package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
+            if package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
                 same_package = True
-                project_name = ''
+                if previous_package!="": # 有包号或项目编号且跟上一行相同时,去除项目名称
+                    project_name = ''
             previous_package = package_code + project_code
-
             if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
                 break
-            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and (re.search('否|未(中标|成交|中选)', win_sort) or win_sort==''): # 2024/04/2 修复 252208201 为空的不中标
+            if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
@@ -6530,7 +6571,7 @@ class TablePremExtractor(object):
             # tenderee = tenderee if self.is_role(tenderee) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
 
-            package = uniform_package_name(package_code) if package_code else '自增'+str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
+            package = uniform_package_name(package_code) if package_code else '自增1' # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
             if project_name != "" and package.startswith('自增'):
                 pk_l = find_package(project_name)
                 if len(pk_l)==1:
@@ -6542,6 +6583,8 @@ class TablePremExtractor(object):
 
             tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
             tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
+            tenderee = cut_repeat_name(tenderee)
+            tenderer = cut_repeat_name(tenderer)
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
@@ -6612,7 +6655,7 @@ class TablePremExtractor(object):
                         "role_text": tenderee,
                         "serviceTime": ""
                 })
-            if tenderer and not same_package:
+            if tenderer:
                 if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
@@ -6623,25 +6666,40 @@ class TablePremExtractor(object):
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
                     continue
+                elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
+                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
+                        prem_dic.pop(package)
+                    continue
 
                 bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                 if (re.search('费率|下浮率|[%%‰折]',
                               bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
                     bid_amount = 0
-                prem_dic[package]['roleList'].append({
-                        "address": "",
-                        "linklist": [],
-                        "role_money": {
-                            "discount_ratio": "",
-                            "downward_floating_ratio": "",
-                            "floating_ratio": "",
-                            "money": bid_amount,
-                            "money_unit": money_unit
-                        },
-                        "role_name": "win_tenderer",
-                        "role_text": tenderer,
-                        "serviceTime": ""
-                })
+                if not same_package or len(prem_dic[package]['roleList'])==0:
+                    prem_dic[package]['roleList'].append({
+                            "address": "",
+                            "linklist": [],
+                            "role_money": {
+                                "discount_ratio": "",
+                                "downward_floating_ratio": "",
+                                "floating_ratio": "",
+                                "money": bid_amount,
+                                "money_unit": money_unit
+                            },
+                            "role_name": "win_tenderer",
+                            "role_text": tenderer,
+                            "serviceTime": ""
+                    })
+                elif prem_dic[package]['roleList'] and prem_dic[package]['roleList'][-1].get('role_name', '')=='win_tenderer':
+                    # Same package as the previous row and a winner is already recorded:
+                    # merge this row's winner into that entry instead of adding a new role.
+                    last_winner = prem_dic[package]['roleList'][-1]
+                    # Seed the list with the first winner, then add each further winner once;
+                    # a repeated row for the same winner must not produce "A,A".
+                    if 'multi_winner' not in last_winner:
+                        last_winner['multi_winner'] = last_winner['role_text']
+                    if tenderer not in last_winner['multi_winner']:
+                        last_winner['multi_winner'] += ','+ tenderer
+                    if bid_amount != 0: # only rows that carry a bid amount are recorded
+                        last_winner.setdefault('other_winner_dic', []).append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
                 tenderer_list.append(tenderer)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
@@ -6656,6 +6714,9 @@ class TablePremExtractor(object):
                 for d in v['roleList']:
                     if d['role_name'] == "win_tenderer":
                         total_money += d['role_money']['money']
+                        if 'other_winner_dic' in d:
+                            for other in d['other_winner_dic']:
+                                total_money += other.get('money', 0)
             return {'自增1': {
                 'code': '',
                 'name': '',
@@ -6709,7 +6770,7 @@ class TablePremExtractor(object):
 
             text = table.text.strip()
             previous = table.findPreviousSibling()
-            text2 = previous .text.strip() if previous else ""
+            text2 = previous.text.strip() if previous else ""
             # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
             if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
                 tb_ex = table.extract()
@@ -6729,9 +6790,17 @@ class TablePremExtractor(object):
                     headers = headers_
                     for j in range(i + 1, len(trs)):
                         if len(trs[j]) == len(trs[i]):
-                            flag_, contain_header_, headers_ = self.find_header(trs[j])
-                            if flag_ or contain_header_:
+                            flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
+                            if flag_2 or contain_header_2:
+                                if j == i+1 and flag_2:
+                                    if len(headers_)<=len(headers_2):
+                                        headers = headers_2
+                                    continue
+                                elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
+                                    continue
                                 break
+                            elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
+                                continue
                             else:
                                 table_items.append(trs[j])
                         else:
@@ -6745,11 +6814,11 @@ class TablePremExtractor(object):
                         self.update_prem(table_prem, prem_)
                     i = j - 1
                 i += 1
-            if table_prem and len(trs) == 2 and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
+            if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 表格内没有标段的,从上一个兄弟标签找标段
                 sib = table.find_previous_sibling()
                 sib_text = sib.get_text()
-                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
-                if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
+                if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
                     table_prem[package_sib] = table_prem.pop('自增1')
@@ -6768,8 +6837,10 @@ class TablePremExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
         prem = self.get_prem(soup, web_source_name)
         if prem == {} and richText:
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
             prem = self.get_prem(richText, web_source_name)
             in_attachment = True
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
@@ -6784,10 +6855,10 @@ class CandidateExtractor(object):
         self.head_rule_dic = {
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
-            "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
-            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
+            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -6795,7 +6866,7 @@ class CandidateExtractor(object):
         }
         '''非表格候选人正则'''
         # self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
-        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
+        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为]?$'
         self.tb = TableTag2List()
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
             self.headerset = pickle.load(f)
@@ -6830,7 +6901,7 @@ class CandidateExtractor(object):
                 if num>1:
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict()
-            if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
+            if ('candidate' in header_dic and 'win_sort' in header_dic) or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic): # 有排名才返回表头进行提取
                 return flag, contain_header, header_dic
         elif len(set(fix_td_list) & self.headerset) >= 2  or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
@@ -6859,6 +6930,9 @@ class CandidateExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
+        text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
+                      text)  # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text)  # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -6883,6 +6957,14 @@ class CandidateExtractor(object):
         findmoney = False
         line_num = 0
         line_package = None
+        package_flag = 0
+        if "package_code" in headers:
+            package_flag = 1
+            if len(df)!=len(set(df[headers["package_code"][0]])): # 如果有包号但重复,进行下列判断是否和跟其他字段组合包号
+                if "project_code" in headers and df[headers["project_code"][0]][0] != df[headers["package_code"][0]][0]:
+                    package_flag = 2
+                elif "project_name" in headers and find_package(df[headers["package_code"][0]][0]):
+                    package_flag = 3
         for i in df.index:
             package_code_raw = df.loc[i, headers['package_code'][0]].strip() if "package_code" in headers else ""
             project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
@@ -6932,12 +7014,21 @@ class CandidateExtractor(object):
             if package == "" and project_code != "":  # 修复 395747178 多项目 只提取到一个
                 package = project_code
             package = uniform_package_name(package) if package !="" else "Project"
+            if package_flag == 2 and project_code != "":
+                project_code_pk = uniform_package_name(project_code)
+                package = "%s_%s"%(project_code_pk, package)
+            elif package_flag == 3 and project_name != "":
+                for iter in find_package(project_name):
+                    project_name_pk = uniform_package_name(iter.group(0))
+                    package = "%s_%s"%(project_name_pk, package)
+                    break
+
             if candidate:
                 if win_or_not and re.search('否|未入围', win_or_not):
                     candidate_set.add(candidate)
-                elif re.search('^((建议|推荐)(中标|成交)|是)$', win_or_not) and win_sort in ['', '参与投标单位及排名'] and win_tenderer=='':
-                    win_sort = '第一名'
-                    candidate_set.add(candidate)
+                # elif re.search('^((建议|推荐)(中标|成交)|是)$', win_or_not) and win_sort in ['', '参与投标单位及排名'] and win_tenderer=='':
+                #     win_sort = '第一名'
+                #     candidate_set.add(candidate)
                 else:
                     candidate_set.add(candidate)
 
@@ -7088,7 +7179,7 @@ class CandidateExtractor(object):
             if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
                 sib = table.find_previous_sibling()
                 sib_text = sib.get_text()
-                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
                 if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
@@ -7128,8 +7219,10 @@ class CandidateExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
         prem, candidate_set = self.get_prem(soup)
         if prem == {} and richText:
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
             prem, candidate_set = self.get_prem(richText)
             in_attachment = True
         candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
@@ -7367,7 +7460,7 @@ class ApprovalPredictor():
             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
                 district = getPredictor('district').get_area(
-                    multi_project['project_name'] + multi_project['project_addr'], '')
+                    multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
                 if district['district']['province'] != '全国':
                     multi_project['area'] = district['district']['area']
                     multi_project['province'] = district['district']['province']
@@ -7379,7 +7472,7 @@ class ApprovalPredictor():
             return rs_l
         elif found_key == 1:
             district = getPredictor('district').get_area(
-                rs_dic['construct_company'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
+                rs_dic['approver'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
             if district['district']['province'] != '全国':
                 rs_dic['area'] = district['district']['area']
                 rs_dic['province'] = district['district']['province']
@@ -7747,18 +7840,17 @@ if __name__=="__main__":
     # rs = product_attr.predict(docid='', html=html, page_time="")
     # print(rs)
 
-    # docid = ""
-    # title = ''
-    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
-    #     html = f.read()
-    # tb_extract = TablePremExtractor()
-    # rs = tb_extract.predict(html, [
-    #     "广东省广裕集团嘉顺实业有限责任公司",
-    #     "广州顺为招标采购有限公司",
-    #     "中华人民共和国"
-    # ], web_source_name = '河钢供应链管理平台')
-    # print('标段数:',len(rs))
-    # print(rs)
+    docid = ""
+    title = ''
+    with open('d:/html/2.html', 'r', encoding='utf-8') as f:
+        html = f.read()
+    tb_extract = TablePremExtractor()
+    rs = tb_extract.predict(html, [
+        "江苏中联铸本混凝土有限公司",
+        "鼓楼区协荣机械设备经销部"
+    ], web_source_name = '河钢供应链管理平台')
+    print('标段数:',len(rs[0]))
+    print(rs)
 
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]

+ 58 - 17
BiddingKG/dl/time/re_servicetime.py

@@ -30,22 +30,23 @@ before = '(?P<before>' \
          '|履约期限|合同的?约定完成时限|合同的?完成日期|承诺完成日期' \
          '|合同起始日起|合同的?履约期|履约截止日期|承包期限|合同的?完成日期|特许经营期限' \
          '|服务期间|服务履行期|委托(管理)?期限|经营期限|数量' \
-         '|(工期|服务期限?|交货期限?|服务履行期|合同期限?|履[行约]期限?)说明' \
+         '|(工期|服务期限?|交货期限?|服务履行期|合同期限?|履[行约]期限?)说明|存款期限?|(存款|存放|定存)(期|年)限|服务日期' \
+         '|服务(有效期|年限)|本?合同有效期|协议有效期|项目期限' \
          ')'
 
 
 # ^(?!.*abc).*$ 排除abc字符串
 before_wuye = '(?P<before>' \
-              '(履约期限、地点等简要信息[::]((履约|时间|期限){1,2}[::])?)' \
+              '(履约期限、地点等简要信息[::]((履约|时间|期限){1,2}[::])?)' \
               ')'
 # '|(履约期限、地点等简要信息[^\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,25})' \
 # (履约期限、地点等简要信息.{0,25}(?= [\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]+([年月日]|个月)|20[21]))
 
 before2 = '(?P<before2>' \
-          '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后|系统开发' \
-          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
-          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
-          '|合同签订生效之日起' \
+          '自合同签订[次]日起至|合同签订[次]日起|自合同签订[次]日起|签订合同后|系统开发' \
+          '|合同签订[次]日起至|自合同签订[次]日|合同签定后|自签订合同[次]日起|自合同签订起' \
+          '|[自从]?合同签[订定]生效[次]日起|自合同签订后不超过|合同签订日至' \
+          '|合同签订生效[次]日起' \
           '|本项目招标有效期|招标有效期' \
           '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
           '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
@@ -57,7 +58,7 @@ before2 = '(?P<before2>' \
           '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
           '|(本次)?采购周期' \
           '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
-          '|服务(有效期|年限)|本?合同有效期|(正式)?入驻(之[日后]|后|起|算)+' \
+          '|服务(有效期|年限)|本?合同有效期|协议有效期|(正式)?入驻(之[日后]|后|起|算)+' \
           '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
           '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
           '|本?项目合同期(为|是)*' \
@@ -66,6 +67,29 @@ before2 = '(?P<before2>' \
         # '|[^。]{0,4}[自从于][^。;;,]{0,15}(之[日后]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}?' \
     # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
+# Variant of before2 used as the leading expression of a pattern (see reg4); alternatives that are unreasonable at the start of a match are left out.
+before2_first = '(?P<before2>' \
+          '自合同签订之日起至|合同签订之日起|自合同签订之日起|签订合同后' \
+          '|合同签订之日起至|自合同签订之日|合同签定后|自签订合同之日起|自合同签订起' \
+          '|[自从]?合同签[订定]生效之日起|自合同签订后不超过|合同签订日至' \
+          '|合同签订生效之日起' \
+          '|本项目招标有效期|招标有效期' \
+          '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
+          '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
+          '|服务要求' \
+          '|签订合同起' \
+          '|项目的有效期限为|项目服务为|签订合同期为' \
+          '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+          '|(本次)?采购周期' \
+          '|(项目招标)?履行期|[自从于]?(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,3}' \
+          '|服务(有效期|年限)|本?合同有效期|协议有效期|(正式)?入驻(之[日后]|后|起|算)+' \
+          '|(合同|协议书|协议)生效(之[日后]|后|起|算)+' \
+          '|自?(提供服务|采购人指定|合同约定)(之[日后]|后|起|算)+' \
+          '|本?项目合同期(为|是)*' \
+          '|交付使用(之[日后]|后|起|算)+|' \
+          ')'
+
 before3 = '(?P<before3>' \
           ',?([\((](日历天|施工时间|单位)[\))]|[\((]天[\))]?|[\((]年[\))]?|[\((]月[\))]?)?' \
           ')'
@@ -136,7 +160,7 @@ reg2 = re.compile(before + before3 + before7 + charac + before5 + before2 + befo
 
 reg3 = re.compile(before + before3 + before7 + charac + before5 + before2 + after2)
 
-reg4 = re.compile(before2[:-2]+before2[-1:] + before5 + center + after)
+reg4 = re.compile(before2_first[:-2]+before2_first[-1:] + before5 + center + after)
 
 reg5 = re.compile(before + before3 + before7 + charac + before5 + before2 + before4 + before6 + center2 + after)
 
@@ -229,18 +253,18 @@ def re_service_time(text):
             prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_find_all_result(reg4, input_str)
+            output_list, text_index_list = re_find_all_result(reg5, input_str)
             if TEST_MODE:
-                print("output_str, text_index reg4", output_list, text_index_list)
+                print("output_str, text_index reg5", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
-            prob = 0.5
+            prob = 0.8
 
         if len(output_list) == 0:
-            output_list, text_index_list = re_find_all_result(reg5, input_str)
+            output_list, text_index_list = re_find_all_result(reg4, input_str)
             if TEST_MODE:
-                print("output_str, text_index reg5", output_list, text_index_list)
+                print("output_str, text_index reg4", output_list, text_index_list)
             output_list, text_index_list = filter_service_time(output_list, text_index_list)
-            prob = 0.8
+            prob = 0.5
 
         # 添加
         all_output_list += output_list
@@ -298,7 +322,7 @@ def filter_service_time(output_list, text_index_list):
         if not re.findall(reg_right_unit, output) and not re.match('^\d{1,3}$', output):
             delete_list.append([output, text_index_list[i]])
             continue
-        if not re.findall("[^之]日|天|年|月|周|星期", output) or re.search("\d{4}[\-\./]\d{1,2}", output):
+        if not (re.findall("[^之]日|天|年|月|周|星期", output) or re.search("\d{4}[\-\./]\d{1,2}", output)):
             delete_list.append([output, text_index_list[i]])
             continue
         # 包含不要的字
@@ -362,7 +386,22 @@ def re_find_all_result(reg, input, unit="", index=0):
         if re.search("数量",i.group()) and not re.search("[年月日天周]",input[i.start()+front_len: i.end()]):
             continue
         # 前述表达有排除词的跳过
-        if re.search("公告|发布",input[i.start():i.start()+front_len]):
+        if re.search("公告|发布|公示",input[i.start():i.start()+front_len]):
+            continue
+        # ‘服务日期’只保留x年的
+        if re.search("服务日期", input[i.start():i.start() + front_len]) \
+            and (re.search('[日月]',input[i.start()+front_len: i.end()]) or not re.search('年',input[i.start()+front_len: i.end()])):
+            continue
+        # 排除某些容易错误的表达
+        if re.search('^(自合同签订[之次]日起至|合同签订[之次]日起|自合同签订[之次]日起|签订合同后' \
+              '|合同签订[之次]日起至|自合同签订[之次]日|合同签定后|自签订合同[之次]日起|自合同签订起' \
+              '|[自从]?合同签[订定]生效[之次]日起|自合同签订后不超过|合同签订日至' \
+              '|合同签订生效[之次]日起|签订合同起' \
+              '|[自从于]?签[订定署字](合同|协议书|协议)并?(期|开始履行|生效|有效期|约定|验收合格|期限|开始服务){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,4}' \
+              '|[自从于]?(采购)?(合同|协议书|协议)(正式)?签[订定署字](完[成毕])?并?(期|开始履行|生效|验收合格|开始服务|期限|有效期|约定){0,2}(之[日后]|日期?[后起]|后|起|算|为)+[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{0,5}' \
+              '|(合同|协议书)签[订定署字]生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+              '|[自从于]服务(合同|协议书|协议)生效(之[日后]|后|起)[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
+              ')',input[i.start():i.start() + front_len]):
             continue
 
         text_index.append([i.start()+front_len, i.end()])
@@ -409,7 +448,9 @@ def test_from_str():
         "服务时间:2023年12-2024年12," \
         "第十四条,服务时间:2023.12-2024.12,。" \
         "第十四条本合同自2023年3月1日起至2024年2月29日止。" \
-        "二、垃圾清运委外期限,垃圾委外清运时间为1年,自2023年1月1日起至2023年12月31日止。"
+        "服务时间:自2022年10月1日至2023年9月3日" \
+        "二、垃圾清运委外期限,垃圾委外清运时间为1年,自2023年1月1日起至2023年12月31日止。" \
+        "服务时间:预计从2022年11月起,12个月。"
     # s = '第十四条,服务时间:2023.12-2024.12服务时间'
 #     s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
 # '''