123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- import codecs
- import re
- from bs4 import BeautifulSoup
- from BiddingKG.dl.table_head.models.model import get_model
- from BiddingKG.dl.table_head.post_process import table_post_process
- from BiddingKG.dl.table_head.pre_process import my_data_loader, table_pre_process
- from BiddingKG.dl.interface.Preprocessing import tableToText, segment
# --- Module-level model initialization (runs once at import time) ---
# Input/output shapes handed to the project's table-head model builder.
# Presumably (rows, cols, embedding_dim) for the input — TODO confirm
# against BiddingKG.dl.table_head.models.model.get_model.
input_shape = (6, 10, 60)
output_shape = (1,)
model = get_model(input_shape, output_shape)
# Load pretrained weights. NOTE(review): the path is relative to the
# current working directory, so this import fails unless the process is
# started from the directory containing "checkpoints/".
model_path = "checkpoints/best.hdf5"
model.load_weights(model_path)
def predict(table_text_list):
    """Run the table-head classifier over one table.

    Parameters
    ----------
    table_text_list : list of rows, each a list of cell-text strings.

    Returns
    -------
    Whatever structure ``table_post_process`` produces — presumably a
    per-cell head/non-head label list mirroring the input table; verify
    against BiddingKG.dl.table_head.post_process.
    """
    # Original author's note: number of table cells * 2 equals the
    # batch_size for a single prediction.
    data_list = table_pre_process(table_text_list, [], 0, is_train=False)
    batch_size = len(data_list)
    # Data pre-processing: wrap the samples in a generator for the model.
    # NOTE(review): an empty table gives batch_size == 0 — confirm callers
    # never pass empty tables, since steps=1 below would then misbehave.
    predict_x = my_data_loader(data_list, [], batch_size, is_train=False)
    # Predict. NOTE(review): predict_generator is deprecated in modern
    # Keras in favor of model.predict — left as-is for compatibility with
    # whatever Keras version this project pins.
    predict_result = model.predict_generator(predict_x, steps=1)
    # Post-process raw scores back onto the table structure.
    table_label_list = table_post_process(table_text_list, predict_result)
    return table_label_list
def predict_html():
    """Debug driver: load a local HTML file and print table-head predictions.

    Reads ``C:\\Users\\Administrator\\Desktop\\2.html``, extracts the
    ``div`` with ``id="pcontent"``, converts each ``<tbody>`` into a 2-D
    list of cleaned cell texts, pads rows to a uniform width, then runs
    :func:`predict` on each table and prints the result.
    """

    def get_trs(tbody):
        """Collect direct <tr> children, descending one nested <tbody> level."""
        trs = []
        for obj in tbody.find_all(recursive=False):
            if obj.name == "tr":
                trs.append(obj)
            if obj.name == "tbody":
                for tr in obj.find_all("tr", recursive=False):
                    trs.append(tr)
        return trs

    def get_table(tbody):
        """Convert a <tbody> into rows of cell texts (non-breaking spaces removed)."""
        inner_table = []
        for tr in get_trs(tbody):
            tr_line = []
            tds = tr.findChildren(['td', 'th'], recursive=False)
            if len(tds) == 0:
                # 2021/12/21 fix: some rows have no <td>; keep the row's own
                # text so the data is not lost.
                tr_line.append(re.sub('\xa0', '', segment(tr, final=False)))
            for td in tds:
                tr_line.append(re.sub('\xa0', '', segment(td, final=False)))
            inner_table.append(tr_line)
        return inner_table

    def fix_table(inner_table, fix_value=""):
        """Right-pad every row with fix_value up to the widest row's length."""
        # default=0 keeps this safe for an empty table (original would also
        # no-op, but via an explicit scan loop).
        max_width = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            row.extend([fix_value] * (max_width - len(row)))
        return inner_table

    # Context manager so the handle is always closed (original leaked it).
    # Also fixes a stray backslash in the original path literal
    # ("...Users\\\Administrator...").
    with codecs.open("C:\\Users\\Administrator\\Desktop\\2.html", "r",
                     encoding="utf8") as f:
        text = f.read()

    # Name the parser explicitly (the second call already used 'lxml');
    # an unspecified parser triggers a bs4 warning and can vary by machine.
    content = str(BeautifulSoup(text, 'lxml').find("div", id="pcontent"))
    soup = BeautifulSoup(content, 'lxml')

    # The original walked tbodies from last to first via index arithmetic;
    # reversed() expresses the same order directly.
    table_list = list(reversed(soup.find_all('tbody')))

    table_fix_list = []
    for tbody in table_list:
        table_fix_list.append(fix_table(get_table(tbody)))

    for table in table_fix_list:
        print("=" * 30)
        print(table)
        print(predict(table))
if __name__ == '__main__':
    # Script entry point: run the local-HTML prediction demo.
    # (A commented-out inline sample table previously kept here for calling
    # predict() directly was removed as dead code; recover it from version
    # history if needed.)
    predict_html()
|