瀏覽代碼

kv+大纲构建

luojiehua 5 月之前
父節點
當前提交
d6e6bc3656

+ 26 - 9
BiddingKG/dl/interface/Preprocessing.py

@@ -115,9 +115,15 @@ def tableToText(soup, docid=None, return_kv=False):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             if len(tds)==0:
-                tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
+                if return_kv:
+                    tr_line.append([re.sub('\xa0','',tr.get_text()),0])
+                else:
+                    tr_line.append([re.sub('\xa0','',segment(tr,final=False)),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
+                if return_kv:
+                    tr_line.append([re.sub('\xa0','',td.get_text()),0])
+                else:
+                    tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -1681,11 +1687,11 @@ def tableToText(soup, docid=None, return_kv=False):
         table2list = TableTag2List()
         return_html_table = True if return_kv else False
         if return_html_table:
-            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table)
+            inner_table, html_table = table2list.table2list(tbody, segment, return_html_table,return_kv=return_kv)
             inner_table = fixTable(inner_table)
             html_table = fixTable(html_table, "")
         else:
-            inner_table = table2list.table2list(tbody, segment)
+            inner_table = table2list.table2list(tbody, segment,return_kv=return_kv)
             inner_table = fixTable(inner_table)
 
         if inner_table == []:
@@ -1778,6 +1784,8 @@ def tableToText(soup, docid=None, return_kv=False):
     # 遍历表格中的每个tbody
     tbodies = []
     in_attachment = False
+    if soup.name=="table":
+        tbodies.append((soup,in_attachment))
     for _part in soup.find_all():
         if _part.name=='table':
             tbodies.append((_part,in_attachment))
@@ -1803,7 +1811,8 @@ def tableToText(soup, docid=None, return_kv=False):
                 else:
                     tbodies[tbody_index - 1][0].append(row)
             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
-            list_innerTable.append(inner_table)
+            if inner_table:
+                list_innerTable.append(inner_table)
             tbody_index += 2
             continue
         inner_table = trunTable(tbody,_in_attachment)
@@ -1814,6 +1823,8 @@ def tableToText(soup, docid=None, return_kv=False):
     # 遍历表格中的每个tbody
     tbodies = []
     in_attachment = False
+    if soup.name=="tbody":
+        tbodies.append((soup,in_attachment))
     for _part in soup.find_all():
         if _part.name == 'tbody':
             tbodies.append((_part, in_attachment))
@@ -1838,7 +1849,8 @@ def tableToText(soup, docid=None, return_kv=False):
                 else:
                     tbodies[tbody_index - 1][0].append(row)
             inner_table = trunTable(tbodies[tbody_index - 1][0], _in_attachment)
-            list_innerTable.append(inner_table)
+            if inner_table:
+                list_innerTable.append(inner_table)
             tbody_index += 2
             continue
         inner_table = trunTable(tbody,_in_attachment)
@@ -1846,9 +1858,14 @@ def tableToText(soup, docid=None, return_kv=False):
         tbody_index += 1
 
     if return_kv:
-        kv_list = [x[1] for x in list_innerTable]
-        text = [x[2] for x in list_innerTable]
-        list_innerTable = [x[0] for x in list_innerTable]
+        kv_list = []
+        for x in list_innerTable:
+            if x[1] is not None:
+                kv_list.extend(x[1])
+        text = ""
+        for x in list_innerTable:
+            if x[2] is not None:
+                text += x[2]
         return soup, kv_list, text
     return soup
     # return list_innerTable

+ 709 - 176
BiddingKG/dl/interface/html_2_kv.py → BiddingKG/dl/interface/html_2_kvtree.py

@@ -9,6 +9,8 @@ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(level
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+from BiddingKG.dl.interface.Preprocessing import tableToText
+from uuid import uuid4
 
 def log(msg):
     '''
@@ -29,6 +31,299 @@ class DotDict(dict):
 
 
 
+def get_tables(soup,dict_table = None):
+    """Build a nested dict describing where table-like elements sit below ``soup``.
+
+    Every direct child of ``soup`` gets a ``{"children": [...]}`` entry; a child
+    that is a <table>/<tbody> with at least one direct <tr> also gets a
+    ``"table"`` key holding the element itself. The function recurses into each
+    child. Before walking, two markup shapes are rewritten in place on the DOM
+    (see the inline comments), so calling this function mutates ``soup``.
+
+    :param soup: element to inspect; a falsy value or a node without a tag
+        name is treated as having no children.
+    :param dict_table: accumulator used by the recursion. Leave it as None for
+        the top-level call.
+    :return: for the top-level call, the result of ``squeeze_tables`` (which
+        can be None when nothing table-like was found); for recursive calls,
+        the ``dict_table`` that was passed in.
+    """
+    is_first = False
+    if dict_table is None:
+        dict_table = {"children":[]}
+        is_first = True
+    if soup and soup.name:
+        childs = soup.contents
+
+    else:
+        childs = []
+
+    # Case "tr + tbody": a leading <tr> directly followed by a <tbody> is copied
+    # into the front of that <tbody>, then the original <tr> is removed.
+    _flag = False
+    if len(childs)>=2:
+        if childs[0].name=="tr" and childs[1].name=="tbody":
+            childs[1].insert(0,copy.copy(childs[0]))
+            childs[0].decompose()
+            _flag = True
+
+    # Keep a reference to the direct-children list; `childs` is reused below
+    # for the <tbody>-only lookup and restored afterwards.
+    childs_bak = childs
+    # Case "tbody + tbody": when the first <tbody> has exactly one row that
+    # contains <td> cells, that row is copied into the front of the second
+    # <tbody> and the first <tbody> is removed.
+    # NOTE(review): this reset discards the flag set by the "tr + tbody" branch
+    # above, so only the "tbody + tbody" merge is visible to the check at the
+    # end of the function - confirm that this is intended.
+    _flag = False
+    if soup and soup.name:
+        childs = soup.find_all("tbody",recursive=False)
+        if len(childs)>=2:
+            if childs[0].name=="tbody" and childs[1].name=="tbody":
+                child0_tr = childs[0].find_all("tr",recursive=False)
+                has_td_count = 0
+                tr_line = None
+                for tr in child0_tr:
+                    if len(tr.find_all("td",recursive=False))>0:
+                        has_td_count += 1
+                        tr_line = tr
+                if has_td_count==1:
+                    childs[1].insert(0,copy.copy(tr_line))
+                    childs[0].decompose()
+                    _flag = True
+
+    # Walk the direct children (not just the <tbody> ones found above).
+    childs = childs_bak
+    for child in childs:
+        _d = {"children":[]}
+        if child.name in ("table","tbody"):
+            if len(child.find_all("tr",recursive=False))>0:
+                # The element object itself is stored, not its string form.
+                # _d["table"] = str(child)
+                _d["table"] = child
+        dict_table["children"].append(_d)
+        # The recursion fills `_d` in place; the returned value is not used.
+        child_dict_table = get_tables(child,_d)
+
+    if is_first:
+        # Only the top-level call may mark `soup` itself as a table, and only
+        # when no "tbody + tbody" merge happened above.
+        # NOTE(review): `soup.name` is read without the falsy guard used
+        # earlier, so a None `soup` would fail here - confirm callers never
+        # pass None.
+        if soup.name in ("table","tbody"):
+            if not _flag:
+                if len(soup.find_all("tr",recursive=False))>0:
+                    # dict_table["table"] = str(soup)
+                    dict_table["table"] = soup
+
+        # Drop branches without tables and collapse single-child wrappers.
+        dict_table = squeeze_tables(dict_table)
+
+    return dict_table
+def squeeze_tables(dict_table):
+    """Prune a ``get_tables`` dict down to the nodes that lead to a table.
+
+    Children are squeezed recursively first. A node carrying a ``"table"`` key
+    is always kept: its ``"children"`` is replaced by the surviving children,
+    or deleted when none survive. A node without a table is replaced by its
+    only surviving child, kept as a wrapper when several survive, and dropped
+    (None) when none do. The input dicts are modified in place.
+
+    :param dict_table: dict with a ``"children"`` list and optional ``"table"``.
+    :return: the squeezed node, or None if the subtree holds no table.
+    """
+    survivors = [node for node in map(squeeze_tables, dict_table["children"])
+                 if node is not None]
+
+    if dict_table.get("table") is None:
+        # Wrapper node: it only matters through what it contains.
+        if not survivors:
+            return None
+        if len(survivors)==1:
+            return survivors[0]
+        dict_table["children"] = survivors
+        return dict_table
+
+    # Table node: keep it, with or without nested tables.
+    if survivors:
+        dict_table["children"] = survivors
+    else:
+        del dict_table["children"]
+    return dict_table
+
+
+
+
+
+def table_to_tree(soup,json_obj=None):
+    """Convert a <table>/<tbody> element into a nested "table" node.
+
+    The element is analysed with ``get_tables``. Each child entry of the result
+    becomes a child node of ``json_obj`` (converted recursively when it carries
+    a table). If the result itself carries a table, that table is rendered with
+    ``tableToText(..., return_kv=True)`` and the node receives ``table_id``,
+    ``text`` and ``kv``. Each kv dict gets a ``position`` dict built from its
+    ``key_sen_index`` / ``value_sen_index`` entries, which are then removed.
+
+    Side effect: a freshly generated uuid string is inserted into the DOM as a
+    text marker in front of the table content.
+
+    :param soup: the <table>/<tbody> element to convert.
+    :param json_obj: node to fill; a new ``DotDict`` is created when None.
+    :return: ``json_obj`` (left as a bare table node when no table is found).
+    """
+    if json_obj is None:
+        json_obj = DotDict({"tag": "table","children":[]})
+
+    dict_table = get_tables(soup)
+    # get_tables hands back the result of squeeze_tables, which is None when
+    # the element contains no table with direct rows (e.g. an empty <table>).
+    # Return the bare node instead of failing on ``None.get``.
+    if dict_table is None:
+        return json_obj
+
+    # Child entries are converted before this table's own tableToText call.
+    for child in dict_table.get("children",[]):
+        _d = DotDict({"tag": "table","children":[]})
+        json_obj["children"].append(_d)
+        table = child.get("table")
+        if table is not None:
+            table_to_tree(table,_d)
+
+    table = dict_table.get("table")
+    if table is not None:
+        table_id = str(uuid4())
+        json_obj["table_id"] = table_id
+        soup, kv_list, text = tableToText(table,return_kv=True)
+        # Plant the id as a text marker: before the first child when the
+        # element has content, otherwise before the element itself.
+        if soup and soup.name and soup.contents:
+            soup.contents[0].insert_before(table_id)
+        else:
+            soup.insert_before(table_id)
+        json_obj["text"] = text
+        json_obj["kv"] = kv_list
+        for _d in kv_list:
+            key_start = _d.get("key_sen_index",0)
+            value_start = _d.get("value_sen_index",0)
+            # Sentence numbers start at 0 here; they are assigned later by
+            # update_table_position.
+            _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":key_start,
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":key_start+len(_d.get("key","")),
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":value_start,
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":value_start+len(_d.get("value",""))
+                              }
+            # The raw offsets now live in "position"; drop the originals.
+            _d.pop("key_sen_index",None)
+            _d.pop("value_sen_index",None)
+    return json_obj
+
+
+def update_table_position(table,sentence_index):
+    """Rebase kv positions of a table tree onto marker-free text and number its sentences.
+
+    All nodes of ``table`` (the node itself and its ``children``, recursively)
+    that have a ``table_id`` are collected. Working on the text of the first
+    collected node:
+
+    1. each node's kv offsets are shifted by the offset at which its
+       ``table_id`` marker occurs in that text (nodes whose marker is absent
+       are not shifted);
+    2. the lengths of all markers located at or before an offset are
+       subtracted from it, and the markers are removed from every node's text;
+    3. the cleaned text is split on "。" into sentences, stored on the first
+       node as ``sentences``, and every kv offset is mapped to a sentence
+       number offset by ``sentence_index``.
+
+    :param table: table node (dict) as produced by ``table_to_tree``.
+    :param sentence_index: sentence number to start counting from.
+    :return: ``sentence_index`` plus the number of sentences found, or
+        ``sentence_index`` unchanged when no node has a ``table_id``.
+    """
+    offset_fields = ("key_begin_sentence_start","key_end_sentence_end",
+                     "value_begin_sentence_start","value_end_sentence_end")
+    sentence_fields = (("key_begin_sentence","key_begin_sentence_start"),
+                       ("key_end_sentence","key_end_sentence_end"),
+                       ("value_begin_sentence","value_begin_sentence_start"),
+                       ("value_end_sentence","value_end_sentence_end"))
+
+    def get_table_idx_lengths(list_table_id,index):
+        # Total length of the id markers that start at or before `index`.
+        _length = 0
+        for _d in list_table_id:
+            idx = _d.get("idx",-1)
+            # Test this marker's own offset (not a value captured from the
+            # enclosing scope) so that only markers preceding `index` count.
+            if idx>=0 and idx<=index:
+                _length += len(_d.get("table_id"))
+        return _length
+
+    def get_sentence_index(list_sent_span,idx):
+        # First span [begin, end] (both ends inclusive) containing idx; 0 if none.
+        list_sent_span.sort(key=lambda x:x[0])
+        for _i in range(len(list_sent_span)):
+            if list_sent_span[_i][0]<=idx and idx<=list_sent_span[_i][1]:
+                return _i
+        return 0
+
+    def get_list_tables(table,list_table=None):
+        # Depth-first list of all nodes that carry a table_id.
+        if list_table is None:
+            list_table = []
+        if table.get("table_id"):
+            list_table.append(table)
+        for child in table.get("children",[]):
+            get_list_tables(child,list_table)
+        return list_table
+
+    tables = get_list_tables(table)
+    if not tables:
+        return sentence_index
+
+    list_table_id = []
+    text = tables[0].get("text","")
+
+    # Step 1: shift every node's offsets by the position of its marker.
+    # NOTE(review): only the marker's start offset is added; whether the
+    # marker's own length should be added as well depends on what
+    # tableToText reports in key_sen_index/value_sen_index - confirm.
+    for _table in tables:
+        table_id = _table.get("table_id")
+        _idx = text.find(table_id)
+        list_table_id.append({"table_id":table_id,"idx":_idx})
+        if _idx>=0:
+            for _d in _table.get("kv",[]):
+                for _field in offset_fields:
+                    _d["position"][_field] += _idx
+
+    # Step 2a: compensate the offsets for the markers that will be removed.
+    for _table in tables:
+        for _d in _table.get("kv",[]):
+            for _field in offset_fields:
+                _d["position"][_field] -= get_table_idx_lengths(list_table_id,_d["position"][_field])
+
+    # Step 2b: strip every marker from every node's text.
+    for _table in tables:
+        _text = _table.get("text","")
+        for _d in list_table_id:
+            _text = _text.replace(_d.get("table_id"),"")
+        _table["text"] = _text
+
+    # Step 3: split the cleaned text of the first node into sentences.
+    text = tables[0].get("text","")
+    list_sentence = str(text).split("。")
+    list_sent_span = []
+    _begin = 0
+    for _i in range(len(list_sentence)):
+        # split() removed the delimiter; put it back on every piece.
+        list_sentence[_i] += "。"
+        _end = _begin+len(list_sentence[_i])
+        list_sent_span.append([_begin,_end])
+        _begin = _end
+    tables[0]["sentences"] = list_sentence
+
+    for _table in tables:
+        for _d in _table.get("kv",[]):
+            for _sentence_field,_offset_field in sentence_fields:
+                _d["position"][_sentence_field] = get_sentence_index(list_sent_span,_d["position"][_offset_field])+sentence_index
+
+    return sentence_index + len(list_sentence)
+
+
+
+
+
+
+
+
+
+
+
+
+def tree_reposition(tree,sentence_index=None):
+    """Assign sentence indices and running character offsets to output objects.
+
+    Every object gets ``sentence_index`` (the counter is incremented before it
+    is assigned), ``sentences``, ``wordOffset_begin`` and ``wordOffset_end``.
+    Objects tagged "table" additionally get ``sentence_index_start`` and
+    ``sentence_index_end``; their kv positions are rebased by
+    ``update_table_position``, whose return value becomes the new counter.
+    The objects are modified in place and nothing is returned.
+
+    :param tree: iterable of dict-like output objects.
+    :param sentence_index: counter value to continue from; None means 0.
+    """
+    current_index = 0 if sentence_index is None else sentence_index
+    char_offset = 0
+
+    for node in tree:
+        current_index += 1
+        node["sentence_index"] = current_index
+
+        if node.get("tag","")=="table":
+            node["sentence_index_start"] = current_index
+            node["sentences"] = [node.get("text","")]
+            # May replace node["sentences"] and returns the advanced counter.
+            current_index = update_table_position(node,current_index)
+            node["sentence_index_end"] = current_index
+        else:
+            node["sentences"] = [node.get("text","")]
+
+        # Offsets run continuously over the sentences of all objects.
+        node["wordOffset_begin"] = char_offset
+        char_offset += sum(len(_sentence) for _sentence in node["sentences"])
+        node["wordOffset_end"] = char_offset
+
+
+
+
+
+
+
+
+
+
+
+
+
 # 递归地将 DOM 转换为 JSON
 # 递归地将 DOM 转换为 JSON
 def dom_to_tree(node):
@@ -36,17 +331,24 @@ def dom_to_tree(node):
         json_obj = DotDict({"tag": node.name})
         if node.attrs:
             json_obj["attributes"] = node.attrs
-        children = []
-        for child in node.contents:
-            _child = dom_to_tree(child)
-            if _child is not None:
-                _child["parent"] = json_obj
-                children.append(_child)
-        if children:
-            json_obj["children"] = children
+
+        is_table = False
+        if node.name in ("table","tbody"):
+            json_obj = table_to_tree(node)
+            is_table = True
+
+        if not is_table:
+            children = []
+            for child in node.contents:
+                _child = dom_to_tree(child)
+                if _child is not None:
+                    children.append(_child)
+            if children:
+                json_obj["children"] = children
+        json_obj["name"] = json_obj.get("tag")
         return json_obj
     elif node.string and node.string.strip():  # 如果是纯文本节点
-        return DotDict({"tag":"text","text": node.string.strip()})
+        return DotDict({"tag":"text","name":"text","text": node.string.strip()})
     return None  # 忽略空白字符
 
 def tree_pop_parent(tree):
@@ -65,13 +367,15 @@ def html_to_tree(html_content):
     # 使用 BeautifulSoup 解析 HTML
     soup = BeautifulSoup(html_content, "lxml")
     dom_tree = dom_to_tree(soup)
+    extract_kv_from_tree(dom_tree)
+    list_objs = get_outobjs_from_tree(dom_tree)
+    tree_reposition(list_objs)
     return dom_tree
 
 def print_tree(dom_tree):
     # 转换为 JSON 格式
     tree_pop_parent(dom_tree)
     json_output = json.dumps(dom_tree,ensure_ascii=False, indent=2)
-    print(json_output)
 
 # kv_pattern = "\s*(?P<key>.{,10})[::]\s*(?P<value>[^::。,()]+?)(\s+|$|;|;)(?![\u4e00-\u9fa5]+:)"
 kv_pattern = r"(?P<key>[\u4e00-\u9fa5]+):\s*(?P<value>[^\s,。();;]+)"
@@ -93,6 +397,7 @@ def get_kv_pattern():
     for match in matches:
         key, value = match
         print("{%s}: {%s}"%(key,value))
+
 def extract_kv_from_sentence(sentence):
     list_kv = []
     _iter = re.finditer("[::]", sentence)
@@ -103,7 +408,16 @@ def extract_kv_from_sentence(sentence):
         if len(list_span)==1:
             _begin,_end = list_span[0]
             if _begin<20 and _end<len(sentence)-1:
-                _d = DotDict({"key":sentence[0:_begin],"value":sentence[_end:],"key_span":[0,_begin],"value_span":[_end,len(sentence)]})
+                _d = DotDict({"key":sentence[0:_begin],"value":sentence[_end:]})
+                _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":0,
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":_begin,
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":_end,
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":len(sentence)
+                              }
                 list_kv.append(_d)
             else:
                 _begin = 0
@@ -111,10 +425,19 @@ def extract_kv_from_sentence(sentence):
                 iter = re.search(kv_pattern,sentence[_begin:_end])
                 if iter is not None:
                     _d = DotDict({})
+
                     _d["key"] = iter.group("key")
                     _d["value"] = iter.group("value")
-                    _d["key_span"] = iter.span("key")
-                    _d["value_span"] = iter.span("value")
+
+                    _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":iter.span("key")[0],
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":iter.span("value")[0],
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
+                              }
                     list_kv.append(_d)
 
         elif len(list_span)>1:
@@ -128,8 +451,16 @@ def extract_kv_from_sentence(sentence):
                     _d = DotDict({})
                     _d["key"] = iter.group("key")
                     _d["value"] = iter.group("value")
-                    _d["key_span"] = iter.span("key")
-                    _d["value_span"] = iter.span("value")
+
+                    _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":iter.span("key")[0],
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":iter.span("value")[0],
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
+                              }
                     list_kv.append(_d)
 
             _begin = list_span[-2][1]
@@ -139,8 +470,16 @@ def extract_kv_from_sentence(sentence):
                 _d = DotDict({})
                 _d["key"] = iter.group("key")
                 _d["value"] = iter.group("value")
-                _d["key_span"] = iter.span("key")
-                _d["value_span"] = iter.span("value")
+
+                _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":iter.span("key")[0],
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":iter.span("value")[0],
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
+                              }
                 list_kv.append(_d)
 
 
@@ -173,75 +512,90 @@ def get_child_text(node):
 def extract_kv_from_tree(tree):
     if isinstance(tree,list):
         _count = 0
+        has_table = False
         for child in tree:
-            _count += extract_kv_from_tree(child)
-        return _count
+            _c,_t = extract_kv_from_tree(child)
+            _count += _c
+            if _t:
+                has_table = _t
+        return _count,has_table
     if isinstance(tree,dict):
-        childs = tree.get("children",[])
-        if len(childs)>0:
-            _count = 0
-            for child in childs:
-                _count += extract_kv_from_tree(child)
-            if _count==0:
-                _text = get_child_text(tree)
-                if "children" in tree:
-                    del tree["children"]
-                tree["text"] = _text
-                list_kv = extract_kv_from_node(tree)
-                _count = len(list_kv)
-                return _count
-            if tree.get("tag","")=="p":
-                _text = get_child_text(tree)
-                tree["text"] = _text
-                p_list_kv = extract_kv_from_node(tree)
-                if len(p_list_kv)>=_count:
+        if tree.get("tag","")!="table":
+            childs = tree.get("children",[])
+            if len(childs)>0:
+                _count = 0
+                has_table = False
+                for child in childs:
+                    _c,_t = extract_kv_from_tree(child)
+                    _count += _c
+                    if _t:
+                        has_table = _t
+                if _count==0:
+                    _text = get_child_text(tree)
                     if "children" in tree:
                         del tree["children"]
-                else:
-                    tree["text"] = ""
-                return len(p_list_kv)
+                    tree["text"] = _text
+                    list_kv = extract_kv_from_node(tree)
+                    _count = len(list_kv)
+                    return _count,has_table
+                if tree.get("tag","")=="p" and not has_table:
+                    _text = get_child_text(tree)
+                    tree["text"] = _text
+                    p_list_kv = extract_kv_from_node(tree)
+                    if len(p_list_kv)>=_count:
+                        if "children" in tree:
+                            del tree["children"]
+                    else:
+                        tree["text"] = ""
+                    return len(p_list_kv),has_table
 
-            return _count
+                return _count,has_table
+            else:
+                list_kv = extract_kv_from_node(tree)
+                return len(list_kv),False
         else:
-            list_kv = extract_kv_from_node(tree)
-            return len(list_kv)
-
+            return len(tree.get("kv",[])),True
+    return 0,False
 
 def update_kv_span(list_kv,append_length):
     for _d in list_kv:
-        _d["key_span"][0] += append_length
-        _d["key_span"][1] += append_length
-        _d["value_span"][0] += append_length
-        _d["value_span"][1] += append_length
-
-def get_sentence_from_tree(tree,list_sentences=None):
+        _d["position"] = {"key_begin_sentence":0,
+                              "key_begin_sentence_start":_d.get("key_sen_index",0),
+                              "key_end_sentence":0,
+                              "key_end_sentence_end":_d.get("key_sen_index",0)+len(_d.get("key","")),
+                              "value_begin_sentence":0,
+                              "value_begin_sentence_start":_d.get("value_sen_index",0),
+                              "value_end_sentence":0,
+                              "value_end_sentence_end":_d.get("value_sen_index",0)+len(_d.get("value",""))
+                              }
+        _d["position"]["key_begin_sentence_start"] += append_length
+        _d["position"]["key_end_sentence_end"] += append_length
+        _d["position"]["value_begin_sentence_start"] += append_length
+        _d["position"]["value_end_sentence_end"] += append_length
+
+def get_outobjs_from_tree(tree,list_outobjs=None):
 
     is_first = False
-    if list_sentences is None:
-        list_sentences = []
+    if list_outobjs is None:
+        list_outobjs = []
         is_first = True
     if isinstance(tree,list):
         for child in tree:
-            get_sentence_from_tree(child,list_sentences)
+            get_outobjs_from_tree(child,list_outobjs)
     if isinstance(tree,dict):
         childs = tree.get("children",[])
         _text = tree.get("text","")
-        if _text!="":
-            tree.name = tree.tag
-            list_sentences.append(tree)
-        for child in childs:
-            get_sentence_from_tree(child,list_sentences)
-    if is_first:
-        wordOffset_begin = 0
-        wordOffset_end = 0
-        for _i in range(len(list_sentences)):
-            list_sentences[_i].sentence_index = _i
-            list_sentences[_i].wordOffset_begin = wordOffset_begin
-            wordOffset_end = wordOffset_begin+len(list_sentences[_i].text)
-            list_sentences[_i].wordOffset_end = wordOffset_end
-            wordOffset_begin = wordOffset_end
+        is_table = True if tree.get("tag","")=="table" else False
+        if is_table:
+            list_outobjs.append(tree)
+        else:
+            if _text!="":
+                tree.name = tree.tag
+                list_outobjs.append(tree)
+            for child in childs:
+                get_outobjs_from_tree(child,list_outobjs)
 
-    return list_sentences
+    return list_outobjs
 
 
 def standard_title_context(_title_context):
@@ -544,10 +898,8 @@ class Html2KVTree():
         else:
 
             _tree = html_to_tree(html_content)
-            extract_kv_from_tree(_tree)
-            self.list_obj = get_sentence_from_tree(_tree)
+            self.list_obj = get_outobjs_from_tree(_tree)
 
-            print(len(self.list_obj))
 
 
         # for obj in self.list_obj:
@@ -803,13 +1155,17 @@ class Html2KVTree():
             sentence_index = obj.sentence_index
             wordOffset_begin = obj.wordOffset_begin
             wordOffset_end = obj.wordOffset_end
+            sentences = obj.sentences
 
             list_kv = obj.get("kv",[])
 
+            table_id = obj.get("table_id")
+
             list_table = None
             block = False
 
             has_product = False
+            position = obj.get("position",{})
 
             if _type=="sentence":
                 if _text in illegal_sentence:
@@ -874,6 +1230,7 @@ class Html2KVTree():
                         list_data[-1]["line_width"] = len(_text)
                         update_kv_span(list_kv,len(_text))
                         list_data[-1]["kv"].extend(list_kv)
+                        list_data[-1]["sentences"].extend(sentences)
                         _append = True
                     elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
                         if list_data[-1]["line_width"]>=max_length*0.7:
@@ -881,51 +1238,20 @@ class Html2KVTree():
                             list_data[-1]["line_width"] = len(_text)
                             update_kv_span(list_kv,len(_text))
                             list_data[-1]["kv"].extend(list_kv)
+                            list_data[-1]["sentences"].extend(sentences)
                             _append = True
 
-            if _type=="table":
-                _soup = BeautifulSoup(_text,"lxml")
-                _table = _soup.find("table")
-                if _table is not None:
-                    list_table = getTable(_table)
-                    if len(list_table)==0:
-                        continue
-                    table_columns = len(list_table[0])
-
-                    if auto_merge_table:
-                        if last_table_index is not None and abs(obj_i-last_table_index)<=2 and last_table_columns is not None and last_table_columns==table_columns:
-                            if last_table is not None:
-                                trs = getTrs(_table)
-                                last_tbody = BeautifulSoup(last_table["text"],"lxml")
-                                _table = last_tbody.find("table")
-                                last_trs = getTrs(_table)
-                                _append = True
-
-                                for _line in list_table:
-                                    last_table["list_table"].append(_line)
-                                if len(last_trs)>0:
-                                    for _tr in trs:
-                                        last_trs[-1].insert_after(copy.copy(_tr))
-                                    last_table["text"] = re.sub("</?html>|</?body>","",str(last_tbody))
-
-                                last_table_index = obj_i
-                                last_table_columns = len(list_table[-1])
-
 
             if not _append:
-                _data = {"type":_type, "text":_text,"list_table":list_table,"line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
+                _data = {"type":_type,"tag":obj.get("tag"),"table_id":table_id, "text":_text,"sentences":sentences,"list_table":list_table,
+                         "line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
                          "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
                          "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
                          "block":block,"has_product":has_product,
                          "sentence_index":sentence_index,"wordOffset_begin":wordOffset_begin,"wordOffset_end":wordOffset_end,
-                         "kv":list_kv
+                         "kv":list_kv,"position":position
                         }
 
-                if _type=="table":
-                    last_table = _data
-                    last_table_index = obj_i
-                    if list_table:
-                        last_table_columns = last_table_columns = len(list_table[-1])
 
                 if sentence_title is not None:
                     if len(list_data)>0:
@@ -1068,10 +1394,105 @@ class Html2KVTree():
 
         return list_data
 
-    def get_sentence_tree(self):
-        return self.tree
+    def get_tree_sentence(self):
+        """Return all sentences held by the tree as one flat list.
+
+        Every node in ``self.tree`` is asked for its ``"sentences"`` entry;
+        nodes without that key contribute nothing.  The order of nodes, and
+        of sentences within a node, is preserved.
+        """
+        return [
+            sentence
+            for node in self.tree
+            for sentence in node.get("sentences",[])
+        ]
+
+
+    def extract_kvs_from_table(self,list_pattern,tree=None,result_kv=None):
+        """Collect key/value pairs from table nodes whose key matches a pattern.
+
+        :param list_pattern: regular expressions; each one is searched
+            (``re.search``) in the ``"key"`` of every kv entry of a table node.
+        :param tree: list of node dicts to scan; defaults to ``self.tree``.
+        :param result_kv: accumulator shared with the recursive calls; leave
+            it as ``None`` for a top-level call.
+        :return: a list parallel to ``list_pattern``.  Entry ``i`` holds one
+            ``{"table_id": ..., "kv": [...]}`` dict per table node that has at
+            least one key matching pattern ``i``; each kv item is a new dict
+            with ``"key"``, ``"value"`` and ``"position"``.  If any pattern
+            cannot be compiled the error is logged and a list of empty lists
+            is returned.
+        """
+        if result_kv is None:
+            # Top-level call: build one result slot per pattern and compile
+            # the patterns once so every node/key reuses them.
+            result_kv = [[] for _ in list_pattern]
+            try:
+                list_pattern = [re.compile(pattern) for pattern in list_pattern]
+            except Exception as e:
+                log("list_pattern error: "+str(e))
+                return result_kv
+        if tree is None:
+            tree = self.tree
+        for obj in tree:
+            if obj.get("tag","")!="table":
+                continue
+
+            table_id = obj.get("table_id")
+            # "kv" may be missing or None; treat both as "no pairs" instead
+            # of failing while iterating over None.
+            list_kv = obj.get("kv") or []
+            for _pi,pattern in enumerate(list_pattern):
+                table_kvs = [
+                    {"key":_d0.get("key",""),"value":_d0.get("value",""),"position":_d0.get("position",{})}
+                    for _d0 in list_kv
+                    if re.search(pattern,_d0.get("key","")) is not None
+                ]
+                if table_kvs:
+                    result_kv[_pi].append({"table_id":table_id,"kv":table_kvs})
+            for child in obj.get("children") or []:
+                # NOTE(review): the shape of "children" is not visible here.
+                # A child that is itself a node dict is wrapped in a list so
+                # the recursive call iterates over nodes rather than over the
+                # dict's keys; a child that is already a list is passed as is.
+                child_tree = [child] if isinstance(child,dict) else child
+                self.extract_kvs_from_table(list_pattern,child_tree,result_kv)
+        return result_kv
 
-    def extract_kv(self,k_pattern,from_kv=True,from_outline=True):
+    def extract_kvs_from_sentence(self,list_pattern,tree=None,result_kv=None):
+        """Collect key/value pairs from non-table nodes whose key matches a pattern.
+
+        :param list_pattern: regular expressions; each one is searched
+            (``re.search``) in the ``"key"`` of every kv entry of a node whose
+            ``"tag"`` is not ``"table"``.
+        :param tree: list of node dicts to scan; defaults to ``self.tree``.
+        :param result_kv: optional pre-built accumulator (one list per
+            pattern); when given, the patterns are not re-validated.
+        :return: a list parallel to ``list_pattern``; entry ``i`` holds shallow
+            copies of the kv dicts whose key matches pattern ``i``.  If any
+            pattern cannot be compiled the error is logged and a list of
+            empty lists is returned.
+        """
+        if result_kv is None:
+            result_kv = [[] for _ in list_pattern]
+            try:
+                for pattern in list_pattern:
+                    re.compile(pattern)
+            except Exception as e:
+                log("list_pattern error: "+str(e))
+                return result_kv
+        if tree is None:
+            tree = self.tree
+        for obj in tree:
+            if obj.get("tag","")=="table":
+                continue
+            # "kv" may be present but None; treat that as "no pairs".
+            list_kv = obj.get("kv") or []
+            for _pi,pattern in enumerate(list_pattern):
+                for _d in list_kv:
+                    if re.search(pattern,_d.get("key","")) is not None:
+                        # Hand out a copy: callers tag the returned dicts
+                        # (e.g. with "from_sentence"), and tagging the
+                        # original would silently modify the tree itself.
+                        result_kv[_pi].append(dict(_d))
+        return result_kv
+
+    def extract_kvs_from_outline(self,list_pattern,tree=None,result_kv=None):
+        """Use matching outline titles as keys and their child text as values.
+
+        A non-table node qualifies for pattern ``i`` when the pattern is found
+        in the node's ``"text"`` and the node has a non-None
+        ``"sentence_title"``.  The value is the concatenated ``"text"`` of the
+        nodes returned by ``get_childs([node])``.
+
+        :param list_pattern: regular expressions searched in the node text.
+        :param tree: list of node dicts to scan; defaults to ``self.tree``.
+        :param result_kv: optional pre-built accumulator (one list per
+            pattern); when given, the patterns are not re-validated.
+        :return: a list parallel to ``list_pattern``; each hit is a dict with
+            the key/value text, ``"from_outline": True`` and the sentence
+            index range.  If any pattern cannot be compiled the error is
+            logged and a list of empty lists is returned.
+        """
+        if result_kv is None:
+            result_kv = [[] for _ in list_pattern]
+            try:
+                for candidate in list_pattern:
+                    re.compile(candidate)
+            except Exception as e:
+                log("list_pattern error: "+str(e))
+                return result_kv
+        nodes = self.tree if tree is None else tree
+        for node in nodes:
+            if node.get("tag","")=="table":
+                continue
+
+            title_text = node["text"]
+
+            for slot,pattern in enumerate(list_pattern):
+                index_from = node["sentence_index"]
+                index_to = index_from
+
+                if re.search(pattern,title_text) is None or node.get("sentence_title") is None:
+                    continue
+
+                # Walk the children once: remember the last sentence index
+                # seen and gather the text pieces for a single join.
+                pieces = []
+                for child in get_childs([node]):
+                    index_to = child["sentence_index"]
+                    pieces.append(child["text"])
+
+                result_kv[slot].append({
+                    "key":title_text,
+                    "value":"".join(pieces),
+                    "from_outline":True,
+                    "key_sentence_index_from":index_from,
+                    "key_sentence_index_to":index_from,
+                    "value_sentence_index_from":index_from,
+                    "value_sentence_index_to":index_to,
+                })
+        return result_kv
+
+
+    def extract_kv(self,k_pattern,from_sentence=True,from_outline=True,from_table=True):
         result_kv = []
         try:
             re.compile(k_pattern)
@@ -1079,34 +1500,26 @@ class Html2KVTree():
             log("k_pattern error: "+str(e))
             traceback.print_exc()
             return result_kv
-        for sentence in self.tree:
-            _text = sentence["text"]
-            list_kv = sentence.get("kv",[])
-            sentence_index_from = sentence["sentence_index"]
-            sentence_index_to = sentence_index_from
-            if from_kv:
-                for _d in list_kv:
-                    _k = _d.get("key","")
-                    _v = _d.get("value","")
-                    _k_span = _d.get("key_span",[])
-                    _v_span = _d.get("value_span",[])
-                    if re.search(k_pattern,_k) is not None:
-                        result_kv.append({"key":_k,"value":_v,"from_kv":True,"key_sentence_index_from":sentence_index_from,
-                                          "key_sentence_index_to":sentence_index_to,"value_sentence_index_from":sentence_index_from,
-                                          "value_sentence_index_to":sentence_index_to,
-                                          "key_span":_k_span,
-                                          "value_span":_v_span})
-            if from_outline:
-                if re.search(k_pattern,_text) is not None and sentence.get("sentence_title") is not None:
-
-                    childs = get_childs([sentence])
-                    _child_text = ""
-                    for _child in childs:
-                        sentence_index_to = _child["sentence_index"]
-                        _child_text+=_child["text"]
-                    result_kv.append({"key":_text,"value":_child_text,"from_outline":True,"key_sentence_index_from":sentence_index_from,
-                                      "key_sentence_index_to":sentence_index_from,"value_sentence_index_from":sentence_index_from,
-                                      "value_sentence_index_to":sentence_index_to,})
+        result_kv = []
+        if from_table:
+            result_kv_table = self.extract_kvs_from_table([k_pattern])
+            for table_d in result_kv_table[0]:
+                table_id = table_d.get("table_id")
+                table_kvs = table_d.get("kv",[])
+                for _d in table_kvs:
+                    _d["from_table"] = True
+                result_kv.extend(table_kvs)
+        if from_sentence:
+            result_kv_sentence = self.extract_kvs_from_sentence([k_pattern])
+            for _d in result_kv_sentence[0]:
+                _d["from_sentence"] = True
+            result_kv.extend(result_kv_sentence[0])
+        if from_outline:
+            result_kv_outline = self.extract_kvs_from_outline([k_pattern])
+            for _d in result_kv_outline[0]:
+                _d["from_outline"] = True
+            result_kv.extend(result_kv_outline[0])
+
         return result_kv
 
     # def extract_kvs_from_table(self,list_pattern):
@@ -1119,57 +1532,177 @@ if __name__ == '__main__':
     html_content = """
 <div>
  <div>
-   中介服务交易|中介服务交易;中选公示;广东省中介超市交易系统 
- </div> 
- <div>
-   采购项目编码: 4451024573035462411180923 
- </div> 
- <div>
-   采购项目名称: 意溪中学办公楼一楼卫生间、保密室、运动场围墙及零碎修缮工程(结算审核) 
+  <a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a>食堂账户开立项目(二次)竞争性磋商公告
  </div> 
- <div>
-   项目业主名称: <a target="_blank" class="markBlue" href="/bdqyhx/212724763956953088.html" style="color: #3083EB !important;text-decoration: underline;">潮州市湘桥区意溪中学</a> 
- </div> 
- <div>
-   中介服务事项: 无(属于非行政管理的中介服务项目采购) 
- </div> 
- <div>
-   投资审批项目编码: 
- </div> 
- <div>
-   服务金额: 暂不做评估与测算 
- </div> 
- <div>
-   金额说明: 按粤价函【2011】742号文及潮财建【2019】19号文的收费标准执行 (行业收费标准)计算。 
- </div> 
- <div>
-   选取中介机构方式: 直接选取 
+ <div> 
+  <div> 
+   <blockquote> 
+    <p>项目概况</p> 
+    <p>辽宁机电职业技术学院食堂账户开立项目 采购项目的潜在供应商应在<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)获取采购文件,并于2024年11月26日 14点00分(北京时间)前提交响应文件。</p> 
+   </blockquote> 
+   <p><strong>一、项目基本情况</strong></p> 
+   <p>项目编号:LNSY-2024101702</p> 
+   <p>项目名称:<a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a>食堂账户开立项目</p> 
+   <p>采购方式:竞争性磋商</p> 
+   <p>预算金额:0.000000 万元(人民币)</p> 
+   <p>最高限价(如有):0.000000 万元(人民币)</p> 
+   <p>采购需求:</p> 
+   <p>本项目为丹东市<a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a>食堂账户开立项目采购项目,银行须提供给学校所需的包括日常资金结算服务、代收代付、转账汇款、账户对帐服务、年检服务、网银操作、安全保障等在内的一切配套基本服务。(具体详见第三章服务需求)</p> 
+   <p></p> 
+   <p>合同履行期限:5年 </p> 
+   <p>本项目(不接受 )联合体投标。</p> 
+   <p><strong>二、申请人的资格要求:</strong></p> 
+   <p>1.满足《中华人民共和国政府采购法》第二十二条规定;</p> 
+   <p>2.落实政府采购政策需满足的资格要求:</p> 
+   <p>无</p> 
+   <p></p> 
+   <p>3.本项目的特定资格要求:投标人应属于在中华人民共和国境内依法设立的国有商业银行、股份制商业银行、邮政储蓄银行、城市商业银行、农村商业银行、农村合作银行及政策性银行,并符合以下条件;(一)在甲方所在地设有分支机构;(二)在甲方所在地范围内依法开展经营活动,内部管理机制健全,具有较强的风险控制能力,近3年内在经营活动中无重大违法违规记录、未发生金融风险及重大违约事件。(三)投标人若为支行,须提供总行或丹东市分行针对本项目唯一授权书。各投标主体不得隶属于同一法人。不接受联合体投标。投标人若为总行或丹东市分行,须提供业务承办银行确认函(明确中标后承办本项目业务的银行名称)。(四)有专人负责办理相关业务。</p> 
+   <p><strong>三、获取采购文件</strong></p> 
+   <p>时间:2024年11月15日 至2024年11月25日,每天上午8:30至11:30,下午13:00至16:30。(北京时间,法定节假日除外)</p> 
+   <p>地点:<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</p> 
+   <p>方式:现场或电子邮件领取</p> 
+   <p>售价:¥500.0 元(人民币)</p> 
+   <p><strong>四、响应文件提交</strong></p> 
+   <p>截止时间:2024年11月26日 14点00分(北京时间)</p> 
+   <p>地点:<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</p> 
+   <p><strong>五、开启</strong></p> 
+   <p>时间:2024年11月26日 14点00分(北京时间)</p> 
+   <p>地点:<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</p> 
+   <p><strong>六、公告期限</strong></p> 
+   <p>自本公告发布之日起3个工作日。</p> 
+   <p><strong>七、其他补充事宜</strong></p> 
+   <p></p> 
+   <p>(一)质疑与投诉</p> 
+   <p>供应商认为自己的权益受到损害的,可以在知道或者应知其权益受到损害之日起七个工作日内,向采购代理机构或采购人提出质疑。</p> 
+   <p>1、接收质疑函方式:书面纸质或电子质疑函</p> 
+   <p>2、质疑函内容、格式:应符合《政府采购质疑和投诉办法》相关规定和财政部制定的《政府采购质疑函范本》格式,详见辽宁政府采购网。</p> 
+   <p>质疑供应商对采购人、采购代理机构的答复不满意,或者采购人、采购代理机构未在规定时间内作出答复的,可以在答复期满后15个工作日内向本级财政部门提起投诉。</p> 
+   <p>(二)购买采购文件时须提供以下材料(以下材料均须加盖单位公章):</p> 
+   <ol> 
+    <li value="NaN">法人或者其他组织的营业执照等主体证明文件或自然人的身份证明复印件(自然人身份证明仅限在自然人作为响应主体时使用);</li> 
+    <li value="NaN">法定代表人(或非法人组织负责人)身份证明书原件(附法定代表人身份证复印件)(自然人作为响应主体时不需提供);</li> 
+    <li value="NaN">授权委托书原件(附授权委托人身份证复印件)(法定代表人、非法人组织负责人、自然人本人购买采购文件的无需提供);</li> 
+   </ol> 
+   <p>注:电子邮件方式领取采购文件的供应商,将上述材料加盖公章的扫描件发送至指定邮箱(lnsy9688@163.com)并致电0415-2199688,主题写明“供应商名称、项目名称、联系人、联系电话”,在领取采购文件截止时间前资料审查通过后,代理机构将采购文件电子版发送至供应商邮箱。</p> 
+   <p></p> 
+   <p></p> 
+   <p><strong>八、凡对本次采购提出询问,请按以下方式联系。</strong></p> 
+   <p>1.采购人信息</p> 
+   <p>名 称:<a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a>     </p> 
+   <p>地址:丹东市振兴区洋河大街30号        </p> 
+   <p>联系方式:王老师0415-3853804      </p> 
+   <p>2.采购代理机构信息</p> 
+   <p>名 称:<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>            </p> 
+   <p>地 址:<a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)            </p> 
+   <p>联系方式:吴平0415-2199688            </p> 
+   <p>3.项目联系方式</p> 
+   <p>项目联系人:吴平</p> 
+   <p>电 话:  04152199688</p>  
+   <p></p> 
+  </div> 
  </div> 
- <div>
-   业务单位咨询电话: 13715781262 
- </div> 
- <div>
-   监督举报: 
- </div> 
- <div>
-   中选中介机构名称: <a target="_blank" class="markBlue" href="/bdqyhx/216479817075470337.html" style="color: #3083EB !important;text-decoration: underline;">中庭国际设计有限公司</a> 
- </div> 
- <div>
-   中介机构联系地址: 广州市天河区大观南路26号C203A 
+ <div> 
+  <div>
+   公告概要:
+  </div> 
+  <table width="600"> 
+   <tbody> 
+    <tr> 
+     <td colspan="4"><b>公告信息:</b></td> 
+    </tr> 
+    <tr> 
+     <td width="128">采购项目名称</td> 
+     <td colspan="3" width="430"><a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a>食堂账户开立项目</td> 
+    </tr> 
+    <tr> 
+     <td>品目</td> 
+     <td colspan="3"><p>服务/金融服务/银行服务/其他银行服务</p></td> 
+    </tr> 
+    <tr> 
+     <td>采购单位</td> 
+     <td colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a></td> 
+    </tr> 
+    <tr> 
+     <td>行政区域</td> 
+     <td width="168">振兴区</td> 
+     <td width="128">公告时间</td> 
+     <td width="168">2024年11月14日 15:25</td> 
+    </tr> 
+    <tr> 
+     <td>获取采购文件时间</td> 
+     <td colspan="3">2024年11月15日至2024年11月25日<br>每日上午:8:30 至 11:30下午:13:00 至 16:30(北京时间,法定节假日除外)</td> 
+    </tr> 
+    <tr> 
+     <td>响应文件递交地点</td> 
+     <td colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</td> 
+    </tr> 
+    <tr> 
+     <td>响应文件开启时间</td> 
+     <td colspan="3">2024年11月26日 14:00</td> 
+    </tr> 
+    <tr> 
+     <td>响应文件开启地点</td> 
+     <td colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</td> 
+    </tr> 
+    <tr> 
+     <td>预算金额</td> 
+     <td colspan="3">¥0.000000万元(人民币)</td> 
+    </tr> 
+    <tr> 
+     <td colspan="4"><b>联系人及联系方式:</b></td> 
+    </tr> 
+    <tr> 
+     <td>项目联系人</td> 
+     <td colspan="3">吴平</td> 
+    </tr> 
+    <tr> 
+     <td>项目联系电话</td> 
+     <td colspan="3">04152199688</td> 
+    </tr> 
+    <tr> 
+     <td width="128">采购单位</td> 
+     <td width="430" colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/211521581770297344.html" style="color: #3083EB !important;text-decoration: underline;">辽宁机电职业技术学院</a></td> 
+    </tr> 
+    <tr> 
+     <td>采购单位地址</td> 
+     <td colspan="3">丹东市振兴区洋河大街30号</td> 
+    </tr> 
+    <tr> 
+     <td>采购单位联系方式</td> 
+     <td colspan="3">王老师0415-3853804</td> 
+    </tr> 
+    <tr> 
+     <td>代理机构名称</td> 
+     <td colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a></td> 
+    </tr> 
+    <tr> 
+     <td>代理机构地址</td> 
+     <td colspan="3"><a target="_blank" class="markBlue" href="/bdqyhx/215785544561414148.html" style="color: #3083EB !important;text-decoration: underline;">辽宁顺业工程咨询有限公司</a>(丹东市振兴区纤维南路1-2-7号)</td> 
+    </tr> 
+    <tr> 
+     <td>代理机构联系方式</td> 
+     <td colspan="3">吴平0415-2199688</td> 
+    </tr> 
+   </tbody> 
+  </table> 
  </div>
 </div>
     """
     _tree = html_to_tree(html_content)
-    extract_kv_from_tree(_tree)
 
-    list_sentences = get_sentence_from_tree(_tree)
+
     _pd = Html2KVTree(html_content)
     _pd.print_tree(_pd.tree,"-|")
 
-
-    list_kv = _pd.extract_kv("交货地点")
+    list_kv = _pd.extract_kv("获取采购文件时间")
     print(list_kv)
 
+    #获取预处理后的所有句子,该句子与kv值对应
+    print(_pd.get_tree_sentence())
+
+    # soup = BeautifulSoup(html_content,"lxml")
+    # table_tree = table_to_tree(soup)
+    # print(json.dumps(table_tree,ensure_ascii=False))
 
 
 

+ 1 - 1
BiddingKG/dl/interface/htmlparser.py

@@ -297,7 +297,7 @@ class ParseDocument():
                 if v is not None:
                     groups.append((k,v))
         if len(groups):
-            # groups.sort(key=lambda x:x[0])
+            groups.sort(key=lambda x:x[0])
             return groups
         return None
 

+ 14 - 5
BiddingKG/dl/interface/predictor.py

@@ -6583,7 +6583,7 @@ class DistrictPredictor():
 
 class TableTag2List():
     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
-    def table2list(self, table, text_process=None, return_html_table=False):
+    def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
         self._output = []
         row_ind = 0
         col_ind = 0
@@ -6626,14 +6626,23 @@ class TableTag2List():
                             if 'title' in cell.attrs and cell.get_text().strip().endswith('...') and cell.get_text().strip()[:-3] in cell.attrs['title']:
                                 td_text = cell.attrs['title']  # 修复 类似 215597851 省略号隐藏内容
                             elif len(td_text)>30:
-                                td_text = re.sub('\xa0', '', text_process(cell, final=False))
+                                if return_kv:
+                                    td_text = cell.get_text()
+                                else:
+                                    td_text = re.sub('\xa0', '', text_process(cell, final=False))
                             if td_text == "":
                                 td_text = ' '
                             text = [td_text,0]
                         else:
-                            text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
-                            # text = re.sub('\s', '', text)[:200] # 只需取前200字即可
-                            text = ' ' if text == "" else text
+                            if return_kv:
+                                td_text = cell.get_text()
+                            else:
+                                td_text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
+                            text = [td_text,0]
+
+                            # text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
+                            # # text = re.sub('\s', '', text)[:200] # 只需取前200字即可
+                            # text = ' ' if text == "" else text
 
                         self._insert(row_ind, col_ind, row_span, col_span, text)
                         if return_html_table: