|
@@ -688,6 +688,7 @@ class PDFConvert:
|
|
if isinstance(y, LTImage):
|
|
if isinstance(y, LTImage):
|
|
lt_image_list.append(y)
|
|
lt_image_list.append(y)
|
|
image_count += 1
|
|
image_count += 1
|
|
|
|
+ lt_text_list = self.delete_water_mark(lt_text_list)
|
|
|
|
|
|
# 若只有文本且图片数为0,直接提取文字及表格
|
|
# 若只有文本且图片数为0,直接提取文字及表格
|
|
if only_image == 0 and image_count == 0:
|
|
if only_image == 0 and image_count == 0:
|
|
@@ -708,6 +709,7 @@ class PDFConvert:
|
|
(float(_edge["x1"]), float(_edge["y1"]))))
|
|
(float(_edge["x1"]), float(_edge["y1"]))))
|
|
list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
|
|
list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
|
|
self._page.in_table_objs = filter_objs
|
|
self._page.in_table_objs = filter_objs
|
|
|
|
+
|
|
for table in list_tables:
|
|
for table in list_tables:
|
|
_table = _Table(table["table"], table["bbox"])
|
|
_table = _Table(table["table"], table["bbox"])
|
|
# self._page.children.append(_table)
|
|
# self._page.children.append(_table)
|
|
@@ -715,11 +717,10 @@ class PDFConvert:
|
|
|
|
|
|
list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
|
|
list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
|
|
layout.bbox, page_no)
|
|
layout.bbox, page_no)
|
|
|
|
+
|
|
|
|
+
|
|
for sentence in list_sentences:
|
|
for sentence in list_sentences:
|
|
_sen = _Sentence(sentence.text, sentence.bbox)
|
|
_sen = _Sentence(sentence.text, sentence.bbox)
|
|
- # _sen.x = sentence.x0
|
|
|
|
- # _sen.y = sentence.y0
|
|
|
|
- # self._page.children.append(_sen)
|
|
|
|
self._page.add_child(_sen)
|
|
self._page.add_child(_sen)
|
|
# pdf对象需反向排序
|
|
# pdf对象需反向排序
|
|
self._page.is_reverse = True
|
|
self._page.is_reverse = True
|
|
@@ -889,6 +890,26 @@ class PDFConvert:
|
|
return self._doc.error_code
|
|
return self._doc.error_code
|
|
return self._doc.get_html()
|
|
return self._doc.get_html()
|
|
|
|
|
|
|
|
def delete_water_mark(self, lt_text_list, times=8):
    """Remove text objects whose text repeats often enough to be a watermark.

    A piece of text that occurs at least ``times`` times among the given
    objects is treated as a watermark, and every object carrying that
    text is dropped.

    Args:
        lt_text_list: Iterable of objects exposing a ``get_text()`` method.
        times: Minimum number of occurrences of the same text for it to
            be considered a watermark. Defaults to 8.

    Returns:
        A new list holding the remaining objects in their original order.
        The input is not modified.
    """
    from collections import Counter

    # Materialize once so one-shot iterables survive both passes below.
    text_objs = list(lt_text_list)
    # Read each object's text a single time and reuse it for counting
    # and for filtering.
    texts = [obj.get_text() for obj in text_objs]
    occurrences = Counter(texts)
    # A set keeps the per-object membership test O(1).
    watermark_texts = {
        text for text, count in occurrences.items() if count >= times
    }
    return [
        obj for obj, text in zip(text_objs, texts)
        if text not in watermark_texts
    ]
|
|
|
|
+
|
|
|
|
|
|
# 以下为现成pdf单页解析接口
|
|
# 以下为现成pdf单页解析接口
|
|
class ParseSentence:
|
|
class ParseSentence:
|