Переглянути джерело

新增表格多包要素提取;包号规范化调整;附件异常表格过滤

lsm 2 роки тому
батько
коміт
89c9ed07b1

+ 82 - 0
BiddingKG/dl/common/Utils.py

@@ -766,6 +766,88 @@ def partMoney(entity_text,input2_shape = [7]):
         parts[6] = 1
     return parts
 
def uniform_num(num):
    """Convert a package ordinal to an Arabic-digit string.

    Handles three spellings:

    * ASCII/Unicode digit strings: returned as-is, except that a two-character
      zero-padded value such as ``'07'`` loses its padding (``'7'``).
    * Chinese numerals from 一 to 九十九 written with 十 (``'十'``, ``'十二'``,
      ``'二十'``, ``'三十四'``).
    * Roman-numeral characters Ⅰ..Ⅶ: the first such character found anywhere in
      the string decides the result.

    :param num: ordinal text
    :return: the normalized digit string, or ``num`` unchanged when it cannot
        be interpreted (e.g. ``'AB'``, ``'一二'``, ``'十十'``)
    """
    units = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5',
             '六': '6', '七': '7', '八': '8', '九': '9'}
    romans = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}

    if num.isdigit():
        # Only a single leading zero on a two-digit value is dropped.
        if re.search(r'^0[\d]$', num):
            return num[1:]
        return num

    if re.search('^[一二三四五六七八九十]+$', num):
        if len(num) == 1:
            return '10' if num == '十' else units[num]
        tens, ten_mark, ones = num.partition('十')
        # Accept only [unit]十[unit]. The operands must be plain units:
        # treating a second 十 as an operand used to yield values such as
        # '110' for '十十', so malformed numerals are now left untouched.
        well_formed = (
            ten_mark
            and (tens == '' or tens in units)
            and (ones == '' or ones in units)
        )
        if well_formed:
            return (units[tens] if tens else '1') + (units[ones] if ones else '0')
        return num

    roman = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num)
    if roman:
        return romans[roman.group(0)]
    return num
+
# Ordinal part of a package name: Arabic digits, Chinese numerals or Roman-numeral characters.
_PKG_NUM = '(?P<num>[0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})'
# Work-type keywords that are moved to the front of the normalized name.
_PKG_KEYWORD = '施工|监理|监测|勘察|设计|劳务'
# "标段" / "分包" / "包号" ... optionally followed by "编号" and a colon (ASCII or full-width).
_PKG_LEADING_UNIT = '(?:标[段号的包项]|[分子]?包|包[组件号])编?号?[::]?'
# File extensions and "YYYY年" fragments that are not part of the package id.
# Longer extensions come first so "docx"/"xlsx" do not leave a stray letter behind.
_PKG_NOISE = re.compile(r'pdf|docx|docs|doc|xlsx|xls|rar|\d{4}年')
# A trailing code of five or more letters/digits/hyphens is kept verbatim (upper-cased).
_PKG_LONG_CODE = re.compile('[a-zA-Z0-9-]{5,}$')
# Tried in order; the first rule that matches supplies the letter and/or number part.
_PKG_RULES = tuple(re.compile(_p) for _p in (
    # letter package + numbered section, e.g. "A包2标段", "A包监理一标段"
    '(?P<eng>[a-zA-Z])包[::)]?(?:' + _PKG_KEYWORD + ')?第?' + _PKG_NUM + '标段?',
    # number in front of the unit word, e.g. "第2标段", "B3包"
    '第?(?P<eng>[a-zA-Z]{1,4})?' + _PKG_NUM + '(?:标[段号的包项]?|[分子]?[包标])',
    # unit word in front of the number, e.g. "标段2", "包号:A3", "包Ⅱ"
    _PKG_LEADING_UNIT + '(?P<eng>[a-zA-Z]{1,4})?' + _PKG_NUM,
    # unit word in front of letters only, e.g. "标段A"
    _PKG_LEADING_UNIT + '(?P<eng>[a-zA-Z]{1,4})',
    # letters in front of the unit word, e.g. "A包"
    '(?P<eng>[a-zA-Z]{1,4})(?:标[段号的包项]|[分子]?[包标]|包[组件号])',
    # the whole text is just an ordinal
    '^' + _PKG_NUM + '$',
))


def uniform_package_name(package_name):
    '''
    Normalize a package / lot identifier.

    Ordinals become Arabic digits (via ``uniform_num``), letters become upper
    case, and a work-type keyword (施工, 监理, 监测, 勘察, 设计, 劳务) found in
    the text is placed first. Examples: ``A包监理一标段`` -> ``监理A1``;
    ``包Ⅱ`` -> ``2``.

    :param package_name: str, raw package identifier text
    :return: the normalized identifier, or the unmodified input when neither a
        keyword nor any known package pattern is found
    '''
    package_name_raw = package_name
    package_name = _PKG_NOISE.sub(' ', package_name)

    kw = re.search(_PKG_KEYWORD, package_name)
    name = kw.group(0) if kw else ""

    long_code = _PKG_LONG_CODE.search(package_name)
    if long_code:
        name += long_code.group(0).upper()
    else:
        for rule in _PKG_RULES:
            found = rule.search(package_name)
            if found is None:
                continue
            parts = found.groupdict()
            if parts.get('eng'):
                name += parts['eng'].upper()
            if parts.get('num'):
                name += uniform_num(parts['num'])
            break

    return name if name else package_name_raw
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 3 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1007,7 +1007,7 @@ def tableToText(soup):
             for _tr in _tbody.find_all(recursive=False):
                 len_td = len(_tr.find_all(recursive=False))
                 _td_len_list.append(len_td)
-            if len(list(set(_td_len_list)))>8:
+            if len(list(set(_td_len_list))) >= 8 or max(_td_len_list) > 100:
                 return None
         fixSpan(tbody)
         inner_table = getTable(tbody)
@@ -2473,8 +2473,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
                                   "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 

+ 8 - 0
BiddingKG/dl/interface/extract.py

@@ -199,6 +199,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("get attributes done of doc_id%s"%(doc_id))
     cost_time["attrs"] = round(time.time()-start_time,2)
 
+    '''表格要素提取'''
+    table_prem = predictor.getPredictor("tableprem").predict(text)
+    if table_prem:
+        if 'Project' in prem[0]['prem']:
+            table_prem.update({'Project': prem[0]['prem']['Project']})
+        prem[0]['prem'] = table_prem
+
+
     '''获取联合体信息'''
     getAttributes.get_win_joint(prem, list_entitys, list_sentences, list_articles)
 

+ 3 - 53
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -802,7 +802,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
     dict_packageCode = dict()
 
     package_number_pattern =  re.compile(
-    '((施工|监理|监测|勘察|设计)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标段?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
+    '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标段?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
     other_package_pattern = re.compile(
         '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  # # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
     win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
@@ -860,54 +860,6 @@ def getPackagesFromArticle(list_sentence, list_entity):
                 packageCode = the_iter.group(1)
         return packageCode
 
-    def uniform_num(num):
-        d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
-        # d2 = {'A': '1', 'B': '2', 'C': '3', 'D': '4', 'E': '5', 'F': '6', 'G': '7', 'H': '8', 'I': '9', 'J': '10'}
-        d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}
-        if num.isdigit():
-            if re.search('^0[\d]$', num):
-                num = num[1:]
-            return num
-        elif re.search('^[一二三四五六七八九十]+$', num):
-            _digit = re.search('^[一二三四五六七八九十]+$', num).group(0)
-            if len(_digit) == 1:
-                num = d1[_digit]
-            elif len(_digit) == 2 and _digit[0] == '十':
-                num = '1'+ d1[_digit[1]]
-            elif len(_digit) == 3 and _digit[1] == '十':
-                num = d1[_digit[0]] + d1[_digit[2]]
-        elif re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num):
-            num = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num).group(0)
-            num = d3[num]
-        return num
-
-    def uniform_package_name(package_name):
-        package_name = re.sub('pdf|doc|docs|xlsx', '', package_name)
-        kw = re.search('(施工|监理|监测|勘察|设计)', package_name)
-        name = ""
-        if kw:
-            name += kw.group(0)
-        if re.search('([a-zA-Z]包[:)]?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}标段?)', package_name): # 处理类似 A包2标段
-            _char = re.search('[a-zA-Z]', package_name).group(0)
-            _digit = re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name).group(0)
-            _digit = uniform_num(_digit)
-            name += _char + _digit
-        elif re.search('[a-zA-Z0-9-]{5,}', package_name):   # 五个字符以上编号
-            _digit = re.search('[a-zA-Z0-9-]{5,}', package_name).group(0).upper()
-            name += _digit
-        elif re.search('[a-zA-Z]{1,4}[0-9]{,3}', package_name):  # 英文的统一为大写
-            _digit = re.search('[a-zA-Z]{1,4}[0-9]{,3}', package_name).group(0).upper()
-            name += _digit
-        elif re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name):  # 数字的统一的阿拉伯数字
-            _digit = re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name).group(0)
-            _digit = uniform_num(_digit)
-            name += _digit
-        if name == "":
-            return package_name
-        else:
-            # print('原始包号:%s, 处理后:%s'%(package_name, name))
-            return name
-
     def get_package():
         PackageList_scope = []
         True_package = set()
@@ -921,7 +873,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
                 if re.match('\d', iter.group(0)) and iter.end() < len(content) and content[
                     iter.end()].isdigit():  # 排除2.10标段3 这种情况
                     continue
-                if re.search('承包|XX|xx', iter.group(0)) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
+                if re.search('[每书/]包|XX|xx', iter.group(0)) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
                     continue
                 temp_package_number = uniform_package_name(iter.group(0))
                 True_package.add(temp_package_number)
@@ -1063,12 +1015,10 @@ def getPackagesFromArticle(list_sentence, list_entity):
         return PackageList
 
     PackageList_scope, True_package = get_package()
-
     PackageList_scope2, True_package2 = get_win_project()
     if len(True_package2) > 2: # 同时包含多标段及多中标人的
         PackageList_scope = PackageList_scope + PackageList_scope2
     PackageList = get_package_scope(PackageList_scope)
-
     return PackageList, PackageSet, dict_packageCode
 
 

BIN
BiddingKG/dl/interface/header_set.pkl


+ 301 - 1
BiddingKG/dl/interface/predictor.py

@@ -53,7 +53,8 @@ dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                   "industry": {"predictor": None, "Lock": RLock()},
                   "rolegrade": {"predictor": None, "Lock": RLock()},
                   "moneygrade": {"predictor": None, "Lock": RLock()},
-                  "district": {"predictor": None, "Lock": RLock()}
+                  "district": {"predictor": None, "Lock": RLock()},
+                  'tableprem': {"predictor": None, "Lock": RLock()},
                   }
 
 
@@ -97,6 +98,8 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = MoneyGrade()
                 if _type == 'district':
                     dict_predictor[_type]["predictor"] = DistrictPredictor()
+                if _type == 'tableprem':
+                    dict_predictor[_type]["predictor"] = TablePremExtractor()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
 
@@ -4317,6 +4320,303 @@ class DistrictPredictor():
                 rs = rs2
         return rs
 
class TablePremExtractor(object):
    """Rule-based extractor of per-package bidding elements from HTML tables.

    ``predict`` parses an HTML document, flattens every ``<table>`` into a
    grid of cell texts, looks for header rows using ``head_rule_dic`` and the
    pickled ``headerset``, and turns the rows below a recognised header into a
    dict keyed by package name.

    NOTE(review): ``table2list`` keeps its working grid in ``self._output``,
    so one instance should not run ``table2list`` from several threads at the
    same time — confirm how the shared predictor instance is used.
    """

    # Money text: a run of Chinese amount characters, or digits with an optional "万".
    _PRICE_PATTERN = r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?"
    # Amounts above this value (10^13) are discarded as implausible.
    _MAX_MONEY = 10000000000000

    def __init__(self):
        # Header regex for each element that can be read from a table column.
        self.head_rule_dic = {
            'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程)(名称?|内容)",
            "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
            "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
            "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
            "bid_amount": "投标[报总]价|(中标|成交)([金总]额|[报均总]价|价[格款])|承包价",
        }

        # os.path.join keeps the path relative when dirname() is empty; plain
        # concatenation would have pointed at the filesystem root in that case.
        # NOTE(review): pickle.load can execute arbitrary code; header_set.pkl
        # must only ever come from a trusted source.
        with open(os.path.join(os.path.dirname(__file__), 'header_set.pkl'), 'rb') as f:
            self.headerset = pickle.load(f)

    @staticmethod
    def _parse_span(value):
        """Return the integer in a rowspan/colspan attribute, never below 1."""
        if value and re.search('[0-9]', value):
            # A span of 0 would insert nothing and stall the grid position,
            # so it is treated like an ordinary single cell.
            return max(1, int(re.sub('[^0-9]', '', value)))
        return 1

    def table2list(self, table):
        """Flatten a parsed ``<table>`` into a list of rows of cell texts.

        Cells with rowspan/colspan are copied into every grid position they
        cover. Cell text is stripped of whitespace and cut to 200 characters.

        :param table: parsed table element providing ``find_all('tr')``
        :return: list of rows (lists of str); ``[]`` when a row grows beyond
            50 columns
        """
        self._output = []
        row_ind = 0
        col_ind = 0
        for row in table.find_all('tr'):
            for cell in row.children:
                if cell.name not in ('td', 'th'):
                    continue
                row_span = self._parse_span(cell.get('rowspan'))
                col_span = self._parse_span(cell.get('colspan'))

                # Skip positions already taken by a span from an earlier row.
                while not self._check_cell_validity(row_ind, col_ind):
                    col_ind += 1

                try:
                    text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
                    text = re.sub(r'\s', '', text)[:200]  # the first 200 characters are enough
                    self._insert(row_ind, col_ind, row_span, col_span, text)
                except UnicodeEncodeError:
                    raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )

                col_ind += col_span
                if col_ind > 50:  # tables wider than 50 columns are dropped
                    return []

            # Spans are at least 1, so each <tr> advances exactly one grid row.
            row_ind += 1
            col_ind = 0
        # Positions that never received a cell are reported as empty text.
        return [["" if cell is None else cell for cell in row] for row in self._output]

    def _check_validity(self, i, j, height, width):
        """
        check if a rectangle (i, j, height, width) can be put into self.output
        """
        return all(self._check_cell_validity(ii, jj) for ii in range(i, i+height) for jj in range(j, j+width))

    def _check_cell_validity(self, i, j):
        """
        check if a cell (i, j) can be put into self._output
        """
        if i >= len(self._output):
            return True
        if j >= len(self._output[i]):
            return True
        if self._output[i][j] is None:
            return True
        return False

    def _insert(self, i, j, height, width, val):
        """Write ``val`` into every position of the rectangle starting at (i, j)."""
        for ii in range(i, i+height):
            for jj in range(j, j+width):
                self._insert_cell(ii, jj, val)

    def _insert_cell(self, i, j, val):
        """Write ``val`` at (i, j) unless that position already holds a cell.

        Missing rows/columns are padded with ``None``. The placeholder must be
        ``None`` (not ``""``) so that ``_check_cell_validity`` can tell an
        unfilled slot from a real cell whose text is empty; otherwise cells
        following a mid-row rowspan are pushed into the wrong column.
        """
        while i >= len(self._output):
            self._output.append([])
        row = self._output[i]
        while j >= len(row):
            row.append(None)

        if row[j] is None:
            row[j] = val

    def find_header(self, td_list):
        """Decide whether a row is a header row and map elements to columns.

        :param td_list: list of cell texts of one row
        :return: ``(flag, header_dic)``. ``flag`` is True when the row has more
            than two distinct texts and over 60% of them are in
            ``self.headerset``. ``header_dic`` maps element name to
            ``(column index, header text)`` and is empty unless the row has a
            project code / package code / project name column together with a
            budget or tenderer column.
        """
        header_dic = dict()
        flag = False
        distinct = set(td_list)
        if len(distinct)>2 and len(distinct & self.headerset)/len(distinct)>0.6:
            flag = True
            for i, text in enumerate(td_list):
                if len(text) > 15:  # texts longer than 15 characters are not matched as headers
                    continue
                if re.search('未(中标|成交)原因', text):  # tables of this kind are not extracted
                    return flag, dict()
                num = 0
                for k, v in self.head_rule_dic.items():
                    if re.search(v, text):
                        header_dic[k] = (i, text)
                        num += 1
                if num>1:
                    print('表头错误,一个td匹配到两个表头:', header_dic)
                    return flag, dict()
            if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                    'budget' in header_dic or 'tenderer' in header_dic):
                return flag, header_dic
        return flag, dict()

    def is_role(self, text):
        """Return True when ``text`` looks like a single organisation name.

        Texts shorter than 5 or longer than 25 characters, or naming more than
        one company, are rejected. Otherwise a suffix rule is tried first and
        ``selffool.ner`` is consulted as a fallback.
        """
        if len(text) > 25 or len(text)<5:
            return False
        elif len(re.findall('有限责?任?公司', text)) > 1:
            return False
        elif re.search(r'[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        else:
            ners = selffool.ner(text)
            if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
                return True
        return False

    def _parse_money(self, text, header_text):
        """Return ``(amount, unit)`` parsed from a money cell, or None if it holds no amount."""
        found = re.findall(self._PRICE_PATTERN, text)
        if not found:
            return None
        money_text = found[0]
        # The unit may be given once in the column header instead of each cell.
        if '万元' in header_text and '万' not in money_text:
            money_text += '万元'
        amount = float(str(getUnifyMoney(money_text)))
        if amount > self._MAX_MONEY:  # larger than a trillion: discard
            amount = 0
        return amount, ('万元' if '万' in money_text else '元')

    @staticmethod
    def _role(role_name, role_text, money=0, money_unit=""):
        """Build one roleList entry."""
        return {
            "address": "",
            "linklist": [],
            "role_money": {
                "discount_ratio": "",
                "downward_floating_ratio": "",
                "floating_ratio": "",
                "money": money,
                "money_unit": money_unit
            },
            "role_name": role_name,
            "role_text": role_text,
            "serviceTime": ""
        }

    def extract_from_df(self, df, headers):
        """Build the package dict from the data rows below one header row.

        :param df: DataFrame of cell texts, one row per table row
        :param headers: mapping from element name to ``(column, header text)``
            as returned by ``find_header``
        :return: dict mapping package name to a dict with keys ``code``,
            ``name``, ``roleList``, ``tendereeMoney`` and ``tendereeMoneyUnit``
        """
        prem_dic = {}
        previous_package = ""  # package code of the previous row
        multi_same_package = False  # a package code was repeated on non-adjacent rows
        package_fix2raw = dict()  # normalized package code -> raw package code
        link_set = set()
        for i in df.index:
            def cell(key):
                return df.loc[i, headers[key][0]] if key in headers else ""

            # Adjacent repeats usually come from rowspan: one package, several purchases.
            same_package = False
            project_code = cell('project_code')
            package_code_raw = cell('package_code')
            project_name = cell('project_name')
            tenderee = cell('tenderee')
            tenderer = cell('tenderer')
            budget_ = cell('budget')
            bid_amount_ = cell('bid_amount')
            win_sort = cell('win_sort')

            package_code = package_code_raw
            if re.search('合计|总计', package_code+project_code):
                continue
            if package_code != '' and package_code == previous_package:  # one package buying several items (doc 208162730)
                same_package = True
                project_name = ''
            previous_package = package_code

            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) is None:
                continue
            # Under a "是否中标" (won or not) column, rows answered "否" are the
            # losers and are skipped. The test used to be inverted, dropping
            # every winning row and keeping the losing ones.
            if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort):
                continue
            if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and 'bid_amount' in headers and re.search('(中标|成交)价', headers['bid_amount'][1]) is None:
                tenderer = ""

            tenderee = tenderee if self.is_role(tenderee) else ""
            tenderer = tenderer if self.is_role(tenderer) else ""

            row_key = (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_)
            if len(set(row_key)) < 2:
                break
            if row_key in link_set:
                continue
            link_set.add(row_key)

            package = package_code if package_code else str(i+1)
            package = uniform_package_name(package)

            # Once a normalized code collides with an earlier one, fall back to the raw code text.
            if multi_same_package == False and package not in package_fix2raw:
                package_fix2raw[package] = package_code_raw
            elif same_package == False:
                multi_same_package = True
            if multi_same_package:
                package = package_code_raw
            if package not in prem_dic or not same_package:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }

            prem_dic[package]['code'] = project_code
            prem_dic[package]['name'] = project_name

            budget_header = headers['budget'][1] if 'budget' in headers else ""
            parsed_budget = self._parse_money(budget_, budget_header)
            if parsed_budget is not None:
                budget, budget_unit = parsed_budget
                # Rows of one package with differing budgets are summed.
                if same_package and prem_dic[package]['tendereeMoney'] != budget:
                    prem_dic[package]['tendereeMoney'] += budget
                else:
                    prem_dic[package]['tendereeMoney'] = budget
                prem_dic[package]['tendereeMoneyUnit'] = budget_unit
            if tenderee and not same_package:
                prem_dic[package]['roleList'].append(self._role("tenderee", tenderee))
            if tenderer and not same_package:
                bid_amount = 0
                money_unit = ""
                bid_header = headers['bid_amount'][1] if 'bid_amount' in headers else ""
                parsed_bid = self._parse_money(bid_amount_, bid_header)
                if parsed_bid is not None:
                    bid_amount, money_unit = parsed_bid
                prem_dic[package]['roleList'].append(
                    self._role("win_tenderer", tenderer, bid_amount, money_unit))
            if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # only a code and a name: discard
                prem_dic.pop(package)
            if multi_same_package:
                # Re-key earlier entries from their normalized code to the raw code.
                for k, v in package_fix2raw.items():
                    if k in prem_dic:
                        prem_dic[v] = prem_dic.pop(k)
        return prem_dic

    def get_prem(self, soup):
        """Extract package data from every table under ``soup``.

        Tables are visited in reverse document order and detached after being
        read. Within a table, each header row is paired with the rows below it
        up to the next header-like row or the first row with a different
        column count; groups with fewer than two data rows are ignored.

        :param soup: parsed document or element providing ``find_all('table')``
        :return: dict merged from ``extract_from_df`` over all header groups
        """
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        for table in tables:
            trs = self.table2list(table)
            table.extract()
            i = 0
            while i < len(trs) - 1:
                flag_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_, headers_ = self.find_header(trs[j])
                            if flag_:
                                break
                            else:
                                table_items.append(trs[j])
                        else:
                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) > 1:
                        df = pd.DataFrame(table_items)
                        prem_ = self.extract_from_df(df, headers)
                        rs_dic.update(prem_)
                    # Resume at the row that ended this group (it may be a new header).
                    i = j - 1
                i += 1
        return rs_dic

    def predict(self, html):
        """Extract package data from an HTML document.

        The ``div.richTextFetch`` element (attachment content) is set aside
        first and only searched when the rest of the page yields nothing.

        :param html: HTML source text
        :return: dict as produced by ``get_prem``
        """
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # keep attachments out of the first pass
        prem = self.get_prem(soup)
        if prem == {} and richText:
            prem = self.get_prem(richText)
        return prem
+
 
 def getSavedModel():
     #predictor = FormPredictor()