|
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
|
|
|
|
|
|
|
|
|
|
#
|
|
#
|
|
-def tableToText(soup, docid=None):
|
|
|
|
|
|
+def tableToText(soup, docid=None, return_kv=False):
|
|
'''
|
|
'''
|
|
@param:
|
|
@param:
|
|
soup:网页html的soup
|
|
soup:网页html的soup
|
|
@@ -1289,7 +1289,359 @@ def tableToText(soup, docid=None):
|
|
# text += rank_text+entity_text+text_line
|
|
# text += rank_text+entity_text+text_line
|
|
# text = text[:-1]+"。" if len(text)>0 else text
|
|
# text = text[:-1]+"。" if len(text)>0 else text
|
|
return text
|
|
return text
|
|
-
|
|
|
|
|
|
+
|
|
|
|
def get_table_text_kv(inner_table, head_list, key_direct=False):
    """Flatten a parsed table into text and record where each key/value lands.

    The table is processed section by section (``head_list`` holds the row
    boundaries of the sections). For every value cell the nearest header
    cells above it and to its left are collected, then each cell is written
    as ``<heads><text>,`` into the output text. While writing, the character
    offsets of the left head, the top head and the value inside the output
    text are stored on the cell dict (``left_head_sen_index``,
    ``top_head_sen_index``, ``text_sen_index``).

    @param:
        inner_table: list of rows; each cell is indexable with ``cell[0]``
            being the cell text and ``cell[1]`` the cell type (0 is treated
            as a value cell, 1 and 2 are treated as header cells).
        head_list: row indexes delimiting the sections; section ``k`` spans
            rows ``head_list[k]`` (inclusive) to ``head_list[k + 1]``
            (exclusive).
        key_direct: when False, header types 1 and 2 are both accepted in
            either direction; when True only type 1 is accepted as a top
            head and only type 2 as a left head.
    @return:
        (text, all_table_occurence): the generated text and the list of
        annotated cell-dict rows for all sections. An empty ``inner_table``
        yields ``("", [])``.
    """
    # 2020/11/23 large-site rule: also cover procurement-style package names
    packPattern = "(标包|标的|标项|品目|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"
    # 2020/11/23 large-site rule: treat a serial number column as ranking
    rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"
    entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
    moneyPattern = "([中投]标|报价)(金额|价)"
    # When the primary head matches this pattern, the head of the other axis
    # is written in front of it instead of behind it.
    headOrderPattern = (
        "[单报标限总]价|金额|成交报?价|报价|供应商|候选人|中标人|[利费]率|负责人|工期|"
        "服务(期限?|年限|时间|日期|周期)|(履约|履行)期限|合同(期限?|(完成|截止)(日期|时间))"
    )

    def collect_heads(table_occurence, cell, positions, head_types, side):
        """Walk away from ``cell`` along ``positions`` and gather its heads.

        ``side`` is "top" or "left" and selects which keys of ``cell`` are
        filled. Consecutive header cells with the same text are recorded only
        once; the walk stops at the first non-header cell met after a header.
        """
        head_text = ""
        found = False
        previous = ""
        coords = []
        for row, col in positions:
            candidate = table_occurence[row][col]
            if candidate["type"] in head_types:
                if not found or candidate["text"] != previous:
                    cell.setdefault(side + "_head_list", []).append(candidate["text"] + ":")
                    # farther heads are prepended so the text reads outer -> inner
                    head_text = candidate["text"] + ":" + head_text
                    coords.append((row, col))
                found = True
                previous = candidate["text"]
                candidate["occu_count"] += 1
            elif found:
                # a value cell after at least one header ends the search
                break
        cell[side + "_head"] += head_text
        cell.setdefault(side + "_head_row_index", []).extend(r for r, _ in coords)
        cell.setdefault(side + "_head_col_index", []).extend(c for _, c in coords)

    def assign_sentence_indices(table_occurence, locations, base):
        """Store the text offsets of heads and value for every written cell.

        ``locations`` must be ordered exactly like the fragments are
        concatenated into the output text; ``base`` is the length of the
        text before those fragments are appended.
        """
        offset = 0
        for row, col, fragment, left_part, top_part, left_first in locations:
            cell = table_occurence[row][col]
            start = base + offset
            if left_first:
                cell['left_head_sen_index'] = start
                cell['top_head_sen_index'] = start + len(left_part)
            else:
                cell['left_head_sen_index'] = start + len(top_part)
                cell['top_head_sen_index'] = start
            cell['text_sen_index'] = start + len(left_part) + len(top_part)
            offset += len(fragment)

    def is_output_cell(cell):
        """Value cells and header cells no value refers to are written out."""
        return cell["type"] == 0 or (cell["type"] == 1 and cell["occu_count"] == 0)

    if not inner_table:
        return "", []

    width = len(inner_table[0])
    text = ""
    top_head_types = [1] if key_direct else [1, 2]
    left_head_types = [2] if key_direct else [1, 2]

    all_table_occurence = []
    for head_i in range(len(head_list) - 1):
        head_begin = head_list[head_i]
        head_end = head_list[head_i + 1]
        direct = getDirect(inner_table, head_begin, head_end)

        # Build the co-occurrence matrix for this section
        table_occurence = []
        for i in range(head_begin, head_end):
            line_oc = []
            for j in range(width):
                cell = inner_table[i][j]
                line_oc.append(
                    {"text": cell[0], "type": cell[1], "occu_count": 0, "left_head": "", "top_head": "",
                     "left_dis": 0, "top_dis": 0,
                     "text_row_index": i, "text_col_index": j
                     })
            table_occurence.append(line_oc)
        occu_height = len(table_occurence)
        occu_width = len(table_occurence[0]) if len(table_occurence) > 0 else 0

        # Find the heads of every non-empty value cell
        for i in range(occu_height):
            for j in range(occu_width):
                cell = table_occurence[i][j]
                if cell["type"] == 0 and cell["text"] != "":
                    collect_heads(table_occurence, cell,
                                  ((i - step, j) for step in range(1, i + 1)),
                                  top_head_types, "top")
                    collect_heads(table_occurence, cell,
                                  ((i, j - step) for step in range(1, j + 1)),
                                  left_head_types, "left")

        # Join heads and values into text
        if direct == "row":
            for i in range(occu_height):
                pack_text = ""
                rank_text = ""
                entity_text = ""
                text_line = ""
                money_text = ""
                # identical fragments inside one sentence are written only once
                text_set = set()
                head = ""
                last_text = ""
                pack_text_location = []
                rank_text_location = []
                entity_text_location = []
                text_line_location = []
                money_text_location = []
                for j in range(width):
                    cell = table_occurence[i][j]
                    if not is_output_cell(cell):
                        continue
                    # the top head gets one extra ":" appended here
                    head = (cell["top_head"] + ":") if len(cell["top_head"]) > 0 else ""
                    now_top_head = head
                    now_left_head = cell["left_head"]
                    if re.search(headOrderPattern, head):
                        head = cell["left_head"] + head
                        left_first = 1
                    else:
                        head += cell["left_head"]
                        left_first = 0

                    fragment_key = head + cell["text"]
                    if fragment_key in text_set:
                        cell['drop'] = 1
                        continue
                    location = [i, j, fragment_key + ",", now_left_head, now_top_head, left_first]
                    if re.search(packPattern, head) is not None:
                        pack_text += fragment_key + ","
                        pack_text_location.append(location)
                    # 2020/11/23 large-site rule: "if" changed to "elif";
                    # 2024/06/20 fix for rows having both a ranking and an
                    # evaluation column: only the first ranking is kept here
                    elif re.search(rankPattern, head) is not None and \
                            re.search(r'(排名|排序|名次|顺序):?第?[\d一二三]', rank_text) is None:
                        rank_text += fragment_key + ","
                        rank_text_location.append(location)
                    elif re.search(entityPattern, head) is not None:
                        entity_text += fragment_key + ","
                        entity_text_location.append(location)
                    elif re.search(moneyPattern, head) is not None and entity_text != "":
                        money_text += fragment_key + ","
                        money_text_location.append(location)
                    else:
                        text_line += fragment_key + ","
                        text_line_location.append(location)
                    text_set.add(fragment_key)
                    last_text = cell['text']

                # Compute the index of keys and values inside the sentence.
                # The order here must be the same as the concatenation order
                # of tr_text below (money fragments precede ordinary ones),
                # otherwise the recorded offsets point at the wrong text.
                head_location_list = (pack_text_location + rank_text_location + entity_text_location
                                      + money_text_location + text_line_location)
                assign_sentence_indices(table_occurence, head_location_list, len(text))

                tr_text = pack_text + rank_text + entity_text + money_text + text_line
                text += tr_text

                # fix for doc 367694716: one statement split over two rows
                if len(text_set - {' '}) == 1 and head == '' and len(last_text) < 25:
                    text = text if re.search(r'\w$', text[:-1]) else text[:-1]
                # fix for doc 494731937: two-column rows were split badly
                elif (width == 2 or len(text_set) == 1) and head != '' and len(tr_text) < 50:
                    text = text if re.search(r'\w$', text[:-1]) else text[:-1]
                else:
                    text = text[:-1] + "。"
        else:
            for j in range(occu_width):
                pack_text = ""
                rank_text = ""
                entity_text = ""
                text_line = ""
                text_set = set()
                pack_text_location = []
                rank_text_location = []
                entity_text_location = []
                text_line_location = []
                for i in range(occu_height):
                    cell = table_occurence[i][j]
                    if not is_output_cell(cell):
                        continue
                    head = cell["left_head"]
                    now_top_head = cell["top_head"]
                    now_left_head = head
                    if re.search(headOrderPattern, head):
                        head = cell["top_head"] + head
                        left_first = 0
                    else:
                        head += cell["top_head"]
                        left_first = 1

                    fragment_key = head + cell["text"]
                    if fragment_key in text_set:
                        cell['drop'] = 1
                        continue
                    location = [i, j, fragment_key + ",", now_left_head, now_top_head, left_first]
                    if re.search(packPattern, head) is not None:
                        pack_text += fragment_key + ","
                        pack_text_location.append(location)
                    # 2020/11/23 large-site rule: "if" changed to "elif"
                    elif re.search(rankPattern, head) is not None:
                        rank_text += fragment_key + ","
                        rank_text_location.append(location)
                    # 2021/10/19 keep lines about past performance from being
                    # moved to the front with the entity fragments
                    elif re.search(entityPattern, head) is not None and \
                            re.search('业绩|资格|条件', head) is None and re.search('业绩', cell["text"]) is None:
                        entity_text += fragment_key + ","
                        entity_text_location.append(location)
                    else:
                        text_line += fragment_key + ","
                        text_line_location.append(location)
                    text_set.add(fragment_key)

                # Compute the index of keys and values inside the sentence,
                # in the same order the fragments are concatenated below
                head_location_list = (pack_text_location + rank_text_location
                                      + entity_text_location + text_line_location)
                assign_sentence_indices(table_occurence, head_location_list, len(text))

                text += pack_text + rank_text + entity_text + text_line
                text = text[:-1] + "。" if len(text) > 0 else text
        all_table_occurence += table_occurence
    return text, all_table_occurence
|
|
|
|
+
|
|
|
|
def process_dict(text, table):
    """Turn annotated table cells into key/value records.

    @param:
        text: the text the table was flattened into.
        table: rows of cell dicts; each cell carries ``text``, ``type``,
            ``text_row_index``, ``text_col_index`` and, for cells written
            into ``text``, ``text_sen_index`` plus optional
            ``left_head_*`` / ``top_head_*`` entries.
    @return:
        (kv_list, kv_dict_list). ``kv_list`` is always returned empty.
        ``kv_dict_list`` holds one dict per (head, value) pair; a value
        without any head yields a single dict with only the ``value*`` and
        ``sen_value`` fields.

    Cells of type 1, cells flagged ``drop`` and cells that have no
    ``text_sen_index`` (they were never written into ``text``) are skipped.
    As a side effect the head lists of a cell are re-ordered by (row, col)
    and stored back on the cell as tuples.
    """
    kv_list = []
    kv_dict_list = []

    def sentence_value(col):
        """Slice of ``text`` located at the recorded value position."""
        start = col['text_sen_index']
        return text[start:start + len(col['text'])]

    def append_head_records(col, side):
        """Emit one record per head on ``side`` ("left" or "top") of ``col``."""
        list_key = side + '_head_list'
        row_key = side + '_head_row_index'
        col_key = side + '_head_col_index'
        # order head, head_row_index, head_col_index by reading order
        ordered = sorted(zip(col[list_key], col[row_key], col[col_key]),
                         key=lambda item: (item[1], item[2]))
        col[list_key], col[row_key], col[col_key] = zip(*ordered)

        # heads are laid out back to back starting at the recorded offset
        key_start = col[side + '_head_sen_index']
        for head, head_row, head_col in ordered:
            kv_dict_list.append({
                'key': head,
                'value': col['text'],
                'key_row_index': head_row,
                'key_col_index': head_col,
                'key_sen_index': key_start,
                'value_row_index': col['text_row_index'],
                'value_col_index': col['text_col_index'],
                'value_sen_index': col['text_sen_index'],
                'sen_key': text[key_start:key_start + len(head)],
                'sen_value': sentence_value(col),
            })
            key_start += len(head)

    for row in table:
        for col in row:
            if col['type'] == 1:
                continue
            if col.get('drop'):
                continue
            # A cell that was never written into the text has no position to
            # report; indexing it used to raise KeyError.
            if 'text_sen_index' not in col:
                continue
            if not col.get('left_head_list') and not col.get('top_head_list'):
                kv_dict_list.append({
                    'value': col['text'],
                    'value_row_index': col['text_row_index'],
                    'value_col_index': col['text_col_index'],
                    'value_sen_index': col['text_sen_index'],
                    'sen_value': sentence_value(col),
                })
                continue
            # value position lies beyond the text
            if col['text_sen_index'] and col['text_sen_index'] >= len(text):
                continue

            if col.get('left_head_list'):
                append_head_records(col, 'left')
            if col.get('top_head_list'):
                append_head_records(col, 'top')
    return kv_list, kv_dict_list
|
|
|
|
+
|
|
def removeFix(inner_table,fix_value="~~"):
|
|
def removeFix(inner_table,fix_value="~~"):
|
|
height = len(inner_table)
|
|
height = len(inner_table)
|
|
width = len(inner_table[0])
|
|
width = len(inner_table[0])
|
|
@@ -1319,14 +1671,22 @@ def tableToText(soup, docid=None):
|
|
table_max_len = 30000
|
|
table_max_len = 30000
|
|
tbody.string = tbody.string[:table_max_len]
|
|
tbody.string = tbody.string[:table_max_len]
|
|
tbody.name = "turntable"
|
|
tbody.name = "turntable"
|
|
|
|
+ if return_kv:
|
|
|
|
+ return None, None, None
|
|
return None
|
|
return None
|
|
# fixSpan(tbody)
|
|
# fixSpan(tbody)
|
|
# inner_table = getTable(tbody)
|
|
# inner_table = getTable(tbody)
|
|
# inner_table = fixTable(inner_table)
|
|
# inner_table = fixTable(inner_table)
|
|
|
|
|
|
table2list = TableTag2List()
|
|
table2list = TableTag2List()
|
|
- inner_table = table2list.table2list(tbody, segment)
|
|
|
|
- inner_table = fixTable(inner_table)
|
|
|
|
|
|
+ return_html_table = True if return_kv else False
|
|
|
|
+ if return_html_table:
|
|
|
|
+ inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
|
|
|
|
+ inner_table = fixTable(inner_table)
|
|
|
|
+ html_table = fixTable(html_table, "")
|
|
|
|
+ else:
|
|
|
|
+ inner_table = table2list.table2list(tbody, segment)
|
|
|
|
+ inner_table = fixTable(inner_table)
|
|
|
|
|
|
if inner_table == []:
|
|
if inner_table == []:
|
|
string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
|
|
string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
|
|
@@ -1335,6 +1695,8 @@ def tableToText(soup, docid=None):
|
|
tbody.string = tbody.string[:table_max_len]
|
|
tbody.string = tbody.string[:table_max_len]
|
|
# log('异常表格直接取全文')
|
|
# log('异常表格直接取全文')
|
|
tbody.name = "turntable"
|
|
tbody.name = "turntable"
|
|
|
|
+ if return_kv:
|
|
|
|
+ return None, None, None
|
|
return None
|
|
return None
|
|
|
|
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
@@ -1347,6 +1709,8 @@ def tableToText(soup, docid=None):
|
|
tbody.string = tbody.string[:table_max_len]
|
|
tbody.string = tbody.string[:table_max_len]
|
|
# log('异常表格,不做表格处理,直接取全文')
|
|
# log('异常表格,不做表格处理,直接取全文')
|
|
tbody.name = "turntable"
|
|
tbody.name = "turntable"
|
|
|
|
+ if return_kv:
|
|
|
|
+ return None, None, None
|
|
return None
|
|
return None
|
|
|
|
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
@@ -1368,12 +1732,28 @@ def tableToText(soup, docid=None):
|
|
# for item in inner_table:
|
|
# for item in inner_table:
|
|
# print(item)
|
|
# print(item)
|
|
|
|
|
|
- tbody.string = getTableText(inner_table,head_list)
|
|
|
|
|
|
+ # print('inner_table111', inner_table)
|
|
|
|
+
|
|
|
|
+ if return_kv:
|
|
|
|
+ text, table = get_table_text_kv(inner_table, head_list)
|
|
|
|
+ kv_list, kv_dict_list = process_dict(text, table)
|
|
|
|
+ tbody.string = text
|
|
|
|
+ # html放入dict
|
|
|
|
+ for kv_dict in kv_dict_list:
|
|
|
|
+ html = html_table[kv_dict.get('value_row_index')][kv_dict.get('value_col_index')]
|
|
|
|
+ kv_dict['value_html'] = html
|
|
|
|
+ else:
|
|
|
|
+ tbody.string = getTableText(inner_table,head_list)
|
|
table_max_len = 30000
|
|
table_max_len = 30000
|
|
tbody.string = tbody.string[:table_max_len]
|
|
tbody.string = tbody.string[:table_max_len]
|
|
# print(tbody.string)
|
|
# print(tbody.string)
|
|
tbody.name = "turntable"
|
|
tbody.name = "turntable"
|
|
- return inner_table
|
|
|
|
|
|
+ if return_kv:
|
|
|
|
+ return inner_table, kv_dict_list, text
|
|
|
|
+ else:
|
|
|
|
+ return inner_table
|
|
|
|
+ if return_kv:
|
|
|
|
+ return None, None, None
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
|
|
@@ -1404,6 +1784,10 @@ def tableToText(soup, docid=None):
|
|
elif _part.name=='div':
|
|
elif _part.name=='div':
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
in_attachment = True
|
|
in_attachment = True
|
|
|
|
+
|
|
|
|
+ if return_kv and tbodies:
|
|
|
|
+ tbodies = tbodies[:1]
|
|
|
|
+
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
# print('len(tbodies)1', len(tbodies))
|
|
# print('len(tbodies)1', len(tbodies))
|
|
# for tbody_index in range(1,len(tbodies)+1):
|
|
# for tbody_index in range(1,len(tbodies)+1):
|
|
@@ -1436,6 +1820,10 @@ def tableToText(soup, docid=None):
|
|
elif _part.name == 'div':
|
|
elif _part.name == 'div':
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
in_attachment = True
|
|
in_attachment = True
|
|
|
|
+
|
|
|
|
+ if return_kv and tbodies:
|
|
|
|
+ tbodies = tbodies[:1]
|
|
|
|
+
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
tbody_index = 1
|
|
tbody_index = 1
|
|
# for tbody_index in range(1,len(tbodies)+1):
|
|
# for tbody_index in range(1,len(tbodies)+1):
|
|
@@ -1457,6 +1845,10 @@ def tableToText(soup, docid=None):
|
|
list_innerTable.append(inner_table)
|
|
list_innerTable.append(inner_table)
|
|
tbody_index += 1
|
|
tbody_index += 1
|
|
|
|
|
|
|
|
+ if return_kv:
|
|
|
|
+ kv_list = [x[1] for x in list_innerTable]
|
|
|
|
+ list_innerTable = [x[0] for x in list_innerTable]
|
|
|
|
+ return soup, kv_list
|
|
return soup
|
|
return soup
|
|
# return list_innerTable
|
|
# return list_innerTable
|
|
|
|
|