123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108 |
- import codecs
- import re
- from bs4 import BeautifulSoup
- from BiddingKG.dl.table_head.models.model import get_model
- from BiddingKG.dl.table_head.post_process import table_post_process
- from BiddingKG.dl.table_head.pre_process import my_data_loader, table_pre_process
- from BiddingKG.dl.interface.Preprocessing import tableToText, segment
# --- Module-level model initialization (runs once at import time) ---
# Input/output shapes handed to the project's table-head model builder.
# Presumably (rows, cols, embedding_dim) for the input — TODO confirm
# against BiddingKG.dl.table_head.models.model.get_model.
input_shape = (6, 10, 60)
output_shape = (1,)
model = get_model(input_shape, output_shape)
# Load pretrained weights. NOTE(review): the path is relative to the
# current working directory, so this import fails unless the process is
# started from the directory containing "checkpoints/".
model_path = "checkpoints/best.hdf5"
model.load_weights(model_path)
def predict(table_text_list):
    """Run the table-head classifier over one table.

    Parameters
    ----------
    table_text_list : list of rows, each a list of cell-text strings.

    Returns
    -------
    Whatever structure ``table_post_process`` produces — presumably a
    per-cell head/non-head label list mirroring the input table; verify
    against BiddingKG.dl.table_head.post_process.
    """
    # Original author's note: number of table cells * 2 equals the
    # batch_size for a single prediction.
    data_list = table_pre_process(table_text_list, [], 0, is_train=False)
    batch_size = len(data_list)
    # Data pre-processing: wrap the samples in a generator for the model.
    # NOTE(review): an empty table gives batch_size == 0 — confirm callers
    # never pass empty tables, since steps=1 below would then misbehave.
    predict_x = my_data_loader(data_list, [], batch_size, is_train=False)
    # Predict. NOTE(review): predict_generator is deprecated in modern
    # Keras in favor of model.predict — left as-is for compatibility with
    # whatever Keras version this project pins.
    predict_result = model.predict_generator(predict_x, steps=1)
    # Post-process raw scores back onto the table structure.
    table_label_list = table_post_process(table_text_list, predict_result)
    return table_label_list
def predict_html():
    """Debug driver: load a local HTML file and print table-head predictions.

    Reads ``C:\\Users\\Administrator\\Desktop\\2.html``, extracts the
    ``div`` with ``id="pcontent"``, converts each ``<tbody>`` into a 2-D
    list of cleaned cell texts, pads rows to a uniform width, then runs
    :func:`predict` on each table and prints the result.
    """

    def get_trs(tbody):
        """Collect direct <tr> children, descending one nested <tbody> level."""
        trs = []
        for obj in tbody.find_all(recursive=False):
            if obj.name == "tr":
                trs.append(obj)
            if obj.name == "tbody":
                for tr in obj.find_all("tr", recursive=False):
                    trs.append(tr)
        return trs

    def get_table(tbody):
        """Convert a <tbody> into rows of cell texts (non-breaking spaces removed)."""
        inner_table = []
        for tr in get_trs(tbody):
            tr_line = []
            tds = tr.findChildren(['td', 'th'], recursive=False)
            if len(tds) == 0:
                # 2021/12/21 fix: some rows have no <td>; keep the row's own
                # text so the data is not lost.
                tr_line.append(re.sub('\xa0', '', segment(tr, final=False)))
            for td in tds:
                tr_line.append(re.sub('\xa0', '', segment(td, final=False)))
            inner_table.append(tr_line)
        return inner_table

    def fix_table(inner_table, fix_value=""):
        """Right-pad every row with fix_value up to the widest row's length."""
        # default=0 keeps this safe for an empty table (original would also
        # no-op, but via an explicit scan loop).
        max_width = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            row.extend([fix_value] * (max_width - len(row)))
        return inner_table

    # Context manager so the handle is always closed (original leaked it).
    # Also fixes a stray backslash in the original path literal
    # ("...Users\\\Administrator...").
    with codecs.open("C:\\Users\\Administrator\\Desktop\\2.html", "r",
                     encoding="utf8") as f:
        text = f.read()

    # Name the parser explicitly (the second call already used 'lxml');
    # an unspecified parser triggers a bs4 warning and can vary by machine.
    content = str(BeautifulSoup(text, 'lxml').find("div", id="pcontent"))
    soup = BeautifulSoup(content, 'lxml')

    # The original walked tbodies from last to first via index arithmetic;
    # reversed() expresses the same order directly.
    table_list = list(reversed(soup.find_all('tbody')))

    table_fix_list = []
    for tbody in table_list:
        table_fix_list.append(fix_table(get_table(tbody)))

    for table in table_fix_list:
        print("=" * 30)
        print(table)
        print(predict(table))
if __name__ == '__main__':
    # Script entry point: run the local-HTML prediction demo.
    # (A commented-out inline sample table previously kept here for calling
    # predict() directly was removed as dead code; recover it from version
    # history if needed.)
    predict_html()
|