# pdfparser.py (26 KB)
  1. #coding:utf8
  2. from pdfminer.pdfparser import PDFParser
  3. from pdfminer.pdfdocument import PDFDocument
  4. from pdfminer.pdfpage import PDFPage
  5. from pdfminer.pdfpage import PDFTextExtractionNotAllowed
  6. from pdfminer.pdfinterp import PDFResourceManager
  7. from pdfminer.pdfinterp import PDFPageInterpreter
  8. from pdfminer.pdfdevice import PDFDevice
  9. from pdfminer.layout import *
  10. from pdfminer.converter import PDFPageAggregator
  11. import re
  12. from PyPDF2 import PdfFileReader as pfr
  13. import logging
  14. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  15. from service.extract.utils.tableutils import LineTable
  16. from pdfplumber.page import Page as pdfPage
  17. from pdfplumber.table import TableFinder
  18. from pdfplumber.pdf import PDF
  19. from io import BytesIO
  20. from scipy.optimize import linear_sum_assignment
  21. class ParseDocument():
  22. def __init__(self,filepath):
  23. self.filename = filepath
  24. self.childs = []
  25. self.linetable = LineTable()
  26. # Open a PDF file.
  27. fp = open(filepath, 'rb')
  28. # Create a PDF parser object associated with the file object.
  29. parser = PDFParser(fp)
  30. # Create a PDF document object that stores the document structure.
  31. # Supply the password for initialization.
  32. # document = PDFDocument(parser)
  33. # Check if the document allows text extraction. If not, abort.
  34. # if not document.is_extractable:
  35. # raise PDFTextExtractionNotAllowed
  36. # Create a PDF resource manager object that stores shared resources.
  37. rsrcmgr = PDFResourceManager()
  38. # Create a PDF device object.
  39. laparams = LAParams(line_overlap=0.01,
  40. char_margin=0.05,
  41. line_margin=0.01,
  42. word_margin=0.01,
  43. boxes_flow=0.1,)
  44. device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  45. # Create a PDF interpreter object.
  46. interpreter = PDFPageInterpreter(rsrcmgr, device)
  47. # Process each page contained in the document.
  48. # outlines = document.get_outlines()
  49. list_sentences = []
  50. self.whole_childs = []
  51. page_no = 0
  52. doctop = 0
  53. _pdf = PDF(fp,laparams=laparams.__dict__)
  54. for page in PDFPage.create_pages(_pdf.doc):
  55. pdf_page = pdfPage(_pdf, page, page_number=page_no, initial_doctop=doctop)
  56. doctop += pdf_page.height
  57. interpreter.process_page(page)
  58. ltpage = device.get_result()
  59. page_no += 1
  60. logging.info("recognize page:%d"%page_no)
  61. # if page_no in (34,35):
  62. # print(ltpage.__dict__)
  63. # r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
  64. # # self.childs.append(r_page)
  65. # # break
  66. # else:
  67. # continue
  68. r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
  69. self.childs.append(r_page)
  70. # print(ltpage.__dict__)
  71. # ParsePage(ltpage).recognize_rect(ltpage)
  72. # if page_no==6:
  73. # print(ltpage.__dict__)
  74. #
  75. # print("====")
  76. # print(r_page.childs)
  77. # if page_no>10:
  78. # break
  79. self.fixPages()
  80. self.buildParsetree()
  81. #识别目录树
  82. for _page in self.childs:
  83. print("%d============"%_page.page_no)
  84. for _sentence in _page.childs:
  85. print(_sentence)
  86. print("%d================"%_page.page_no)
  87. if self.parseTree:
  88. self.parseTree.printParseTree()
  89. def fixPages(self,margin=2):
  90. for _page in self.childs:
  91. _page.fixSentences()
  92. for _i in range(len(self.childs)-1):
  93. p_i = len(self.childs)-_i-1
  94. last_p_i = p_i -1
  95. _page = self.childs[p_i]
  96. l_page = self.childs[last_p_i]
  97. if len(_page.childs)>0 and len(l_page.childs)>0:
  98. _child = _page.childs[0]
  99. l_child = l_page.childs[-1]
  100. if isinstance(_child,(ParseTable)) and isinstance(l_child,(ParseTable)):
  101. if abs(_child.bbox[0]-l_child.bbox[0])<margin and abs(_child.bbox[2]-l_child.bbox[2])<margin:
  102. #todo make sure uniontable coright
  103. _addheight = 800
  104. for _line in _child.table:
  105. for _cell in _line:
  106. _addheight = max(_addheight,_cell["bbox"][3])
  107. _addheight += 100
  108. set_cell_id = set()
  109. for t_line in l_child.table:
  110. for _cell in t_line:
  111. _id = id(_cell)
  112. if _id not in set_cell_id:
  113. _cell["bbox"] = (_cell["bbox"][0],_addheight+_cell["bbox"][1],_cell["bbox"][2],_addheight+_cell["bbox"][3])
  114. set_cell_id.add(_id)
  115. _t = self.linetable.unionTable([_child.table,l_child.table])
  116. _table = ParseTable(_t["bbox"],_t["table"])
  117. l_page.childs[-1] = _table
  118. _page.childs.pop(0)
  119. pass
  120. if isinstance(_child,(ParseSentence)) and isinstance(l_child,(ParseSentence)):
  121. if not _child.is_outline and not _child.title:
  122. if abs(l_child.bbox[2]-l_page.bbox[2])<100:
  123. l_child.text += _child.text
  124. _page.childs.pop(0)
  125. self.getWholeChilds()
  126. def getWholeChilds(self):
  127. if len(self.whole_childs)>0:
  128. return self.whole_childs
  129. whole_childs = []
  130. for _page in self.childs:
  131. whole_childs.extend(_page.childs)
  132. self.whole_childs = whole_childs
  133. return self.whole_childs
  134. def get_next_title(self,_groups):
  135. next_title = ""
  136. if _groups is None or len(_groups)==0:
  137. return None
  138. for _g in _groups:
  139. if _g[0][-1]=="0":
  140. next_title += _g[1]
  141. else:
  142. next_title += ParseUtils.get_next_title(_g[1])
  143. return next_title
  144. def find_scopes(self,tree,whole_childs,begin,end,degree):
  145. if end<=begin:
  146. return
  147. list_index = []
  148. list_child = []
  149. for _index in range(begin,end+1):
  150. _child = whole_childs[_index]
  151. if isinstance(_child,ParseSentence):
  152. if not _child.is_outline and _child.title and _child.title_degree==degree:
  153. list_child.append(_child)
  154. list_index.append(_index)
  155. _graph = [[10000 for i in range(len(list_child))]for _ in range(len(list_child))]
  156. _prob = -9000
  157. for _i in range(len(list_child)):
  158. _child = list_child[_i]
  159. if ParseUtils.is_first_title(_child.title):
  160. _prob += 100
  161. if _child.groups is None:
  162. if _i<len(list_child)-1:
  163. _graph[_i][_i+1] = min(_prob,_graph[_i][_i+1])
  164. else:
  165. _next_title = self.get_next_title(_child.groups[1:])
  166. for _j in range(_i+1,len(list_child)):
  167. n_child = list_child[_j]
  168. # print("|",n_child.title_text,n_child.fontsize,n_child.fontname)
  169. if n_child.title_text.replace(".",".")==_next_title.replace(".",".") and int(_child.fontsize)==int(n_child.fontsize) and _child.fontname==n_child.fontname:
  170. _graph[_i][_j] = min(_prob,_graph[_i][_j])
  171. if len(list_child)==0:
  172. return
  173. rows,cols = linear_sum_assignment(_graph)
  174. r = rows[0]
  175. c = cols[0]
  176. while 1:
  177. if _graph[r][c]==10000 or r==len(list_child)-1 or c<=r:
  178. list_child[r].scope[1] = end
  179. _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
  180. tree.addChild(_parseTree)
  181. next_degree = None
  182. for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
  183. _c = whole_childs[i]
  184. if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title:
  185. next_degree = _c.title_degree
  186. break
  187. if next_degree:
  188. self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
  189. break
  190. list_child[r].scope[1] = list_child[c].scope[0]-1
  191. _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
  192. tree.addChild(_parseTree)
  193. next_degree = None
  194. for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
  195. _c = whole_childs[i]
  196. # print(_c.__dict__.get("title"))
  197. if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title :
  198. next_degree = _c.title_degree
  199. break
  200. if next_degree:
  201. self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
  202. r = rows[c]
  203. c = cols[r]
  204. def buildParsetree(self):
  205. self.parseTree = None
  206. whole_childs = self.getWholeChilds()
  207. list_degree = []
  208. _index = -1
  209. for _child in whole_childs:
  210. _index += 1
  211. _child.scope = [_index,_index]
  212. if isinstance(_child,ParseSentence):
  213. if _child.title_degree is not None:
  214. list_degree.append(_child.title_degree)
  215. if len(list_degree)==0:
  216. return
  217. first_degree = min(list_degree)
  218. print("first_degree",first_degree)
  219. self.parseTree = ParseTree(None,None,[0,len(whole_childs)])
  220. self.find_scopes(self.parseTree,whole_childs,0,len(whole_childs)-1,first_degree)
  221. pass
  222. class ParsePage():
  223. def __init__(self,lt,_page,pdf_page,page_no):
  224. self.page_no = page_no
  225. self.childs = []
  226. self.linetable = lt
  227. list_textbox = []
  228. list_line = []
  229. self.bbox = _page.bbox
  230. list_rect = []
  231. for _obj in _page._objs:
  232. # if isinstance(_obj,LTLine):
  233. # list_line.append(_obj)
  234. if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
  235. list_textbox.append(_obj)
  236. if isinstance(_obj,(LTRect)):
  237. list_rect.append(_obj)
  238. _tableFinder = TableFinder(pdf_page)
  239. for _edge in _tableFinder.get_edges():
  240. list_line.append(LTLine(1,(float(_edge["x0"]),float(_edge["y0"])),(float(_edge["x1"]),float(_edge["y1"]))))
  241. ParseUtils.getFontinfo(_page)
  242. tables,filter_objs,_ = self.linetable.recognize_table(list_textbox,list_line)
  243. # tables_rect,filter_objs_rect,_ = self.linetable.recognize_table_by_rect(list_textbox,list_rect)
  244. # print("====$$$",len(filter_objs))
  245. for _table in tables:
  246. self.childs.append(ParseTable(_table["bbox"],_table["table"]))
  247. # if len(filter_objs&filter_objs_rect)==0:
  248. # for _table in tables_rect:
  249. # self.childs.append(ParseTable(_table["bbox"],_table["table"]))
  250. # filter_objs = filter_objs & filter_objs_rect
  251. list_sentences = ParseUtils.recognize_sentences(list_textbox,filter_objs,_page.bbox,page_no)
  252. self.childs.extend(list_sentences)
  253. self.childs.sort(key=lambda x:x.bbox[3],reverse=True)
  254. def fixSentences(self):
  255. '''
  256. #fix the sentences of page by context
  257. :return:
  258. '''
  259. set_remove = set()
  260. for _i in range(1,len(self.childs)):
  261. _sentence = self.childs[_i]
  262. if not isinstance(_sentence,(ParseSentence)):
  263. continue
  264. if not _sentence.is_outline and not _sentence.title:
  265. if _i>0:
  266. _j = _i
  267. while 1:
  268. _j -= 1
  269. _sen_tmp = self.childs[_j]
  270. if isinstance(_sen_tmp,(ParseTable)):
  271. _j = -1
  272. break
  273. if _j not in set_remove and abs(_sen_tmp.bbox[2]-self.bbox[2])<100:
  274. break
  275. if _j<0:
  276. break
  277. if _j>=0:
  278. set_remove.add(_i)
  279. self.childs[_j].text += _sentence.text
  280. self.childs[_j].bbox = (min(_sentence.bbox[0],self.childs[_j].bbox[0]),min(_sentence.bbox[1],self.childs[_j].bbox[1]),
  281. max(_sentence.bbox[2],self.childs[_j].bbox[2]),max(_sentence.bbox[3],self.childs[_j].bbox[3]))
  282. list_remove = list(set_remove)
  283. list_remove.sort(key=lambda x:x,reverse=True)
  284. for _i in list_remove:
  285. self.childs.pop(_i)
  286. class ParseTree():
  287. def __init__(self,parent_tree,node,child_scope):
  288. self.parent_tree = parent_tree
  289. self.node = node
  290. self.childs = []
  291. self.child_scope = child_scope
  292. def setParent(self,parent_tree):
  293. self.parent_tree = parent_tree
  294. def addChild(self,tree):
  295. self.childs.append(tree)
  296. def printParseTree(self,degree=1):
  297. for p in self.childs:
  298. print("======%d====="%degree)
  299. print(p.node)
  300. p.printParseTree(degree+1)
  301. print("======%d====="%degree)
  302. class ParseTable():
  303. def __init__(self,bbox,_table):
  304. self.table = _table
  305. self.bbox = bbox
  306. def __repr__(self):
  307. _string = "table>>>>>>>>>>>>>>>>>>>>>>>>>\n"
  308. for _line in self.table:
  309. for _cell in _line:
  310. _string += "[%s]%s"%(_cell.get("text").replace("\n","")[:10],"\t\t")
  311. _string += "\n"
  312. return _string
  313. def getSentence(self):
  314. #todo transform table to sentence
  315. pass
  316. class ParseSentence():
  317. def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
  318. (x0,y0,x1,y1) = bbox
  319. self.x0 = x0
  320. self.y0 = y0
  321. self.x1 = x1
  322. self.y1 = y1
  323. self.bbox = bbox
  324. self.fontname = fontname
  325. self.fontsize = fontsize
  326. self.text = _text
  327. self.title = _title
  328. self.title_text = title_text
  329. self.groups = _pattern
  330. self.title_degree = title_degree
  331. self.is_outline = is_outline
  332. self.outline_location = outline_location
  333. self.page_no = page_no
  334. def __repr__(self):
  335. return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
  336. class ParseUtils():
  337. @staticmethod
  338. def getFontinfo(_page):
  339. for _obj in _page._objs:
  340. if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
  341. for textline in _obj._objs:
  342. done = False
  343. for lchar in textline._objs:
  344. if isinstance(lchar,(LTChar)):
  345. _obj.fontname = lchar.fontname
  346. _obj.fontsize = lchar.size
  347. done = True
  348. break
  349. if done:
  350. break
  351. @staticmethod
  352. def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):
  353. list_textbox.sort(key=lambda x:x.bbox[0])
  354. list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
  355. cluster_textbox = []
  356. for _textbox in list_textbox:
  357. if _textbox in filter_objs:
  358. continue
  359. _find = False
  360. for _ct in cluster_textbox:
  361. if abs(_ct["y"]-_textbox.bbox[1])<5:
  362. _find = True
  363. _ct["textbox"].append(_textbox)
  364. if not _find:
  365. cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})
  366. cluster_textbox.sort(key=lambda x:x["y"],reverse=True)
  367. list_sentences = []
  368. for _line in cluster_textbox:
  369. _textboxs = _line["textbox"]
  370. _textboxs.sort(key=lambda x:x.bbox[0])
  371. _linetext = _textboxs[0].get_text()
  372. for _i in range(1,len(_textboxs)):
  373. if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
  374. if _linetext[-1] not in (",",",","。",".","、",";"):
  375. _linetext += "=,="
  376. _linetext += _textboxs[_i].get_text()
  377. _linetext = re.sub("[\s\r\n]","",_linetext)
  378. _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
  379. _title = None
  380. _pattern_groups = None
  381. title_text = ""
  382. if not _title:
  383. _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
  384. if _groups:
  385. _title = _groups[0][0]
  386. title_text = _groups[0][1]
  387. _pattern_groups = _groups
  388. if not _title:
  389. _groups = ParseUtils.find_title_by_pattern(_linetext)
  390. if _groups:
  391. _title = _groups[0][0]
  392. title_text = _groups[0][1]
  393. _pattern_groups = _groups
  394. if not _title:
  395. _title = ParseUtils.rec_incenter(_bbox,page_bbox)
  396. title_degree = 2
  397. if not _title:
  398. _linetext = _linetext.replace("=,=",",")
  399. else:
  400. _linetext = _linetext.replace("=,=","")
  401. title_degree = int(_title.split("_")[1])
  402. #页码
  403. if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
  404. continue
  405. if _linetext=="" or re.search("^,+$",_linetext) is not None:
  406. continue
  407. is_outline = False
  408. outline_location = -1
  409. _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
  410. if _search is not None:
  411. is_outline = True
  412. _linetext = _search.group("text")
  413. outline_location = int(_search.group("nums"))
  414. list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
  415. # for _sen in list_sentences:
  416. # print(_sen.__dict__)
  417. return list_sentences
  418. @staticmethod
  419. def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
  420. "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
  421. "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
  422. "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]))|" \
  423. "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]))|" \
  424. "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\..、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..、\s\-]))|" \
  425. "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\..、\s\-]))|" \
  426. "(?P<title_15>^(?P<title_15_index_0_0>(?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>)))|" \
  427. "(?P<title_17>^(?P<title_17_index_0_0>(?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>)))|"
  428. "(?P<title_19>^(?P<title_19_index_0_0>(?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>)))|" \
  429. ):
  430. _se = re.search(_pattern,_text)
  431. groups = []
  432. if _se is not None:
  433. _gd = _se.groupdict()
  434. for k,v in _gd.items():
  435. if v is not None:
  436. groups.append((k,v))
  437. if len(groups):
  438. groups.sort(key=lambda x:x[0])
  439. return groups
  440. return None
  441. @staticmethod
  442. def rec_incenter(o_bbox,p_bbox):
  443. p_width = p_bbox[2]-p_bbox[0]
  444. l_space = (o_bbox[0]-p_bbox[0])/p_width
  445. r_space = (p_bbox[2]-o_bbox[2])/p_width
  446. if abs((l_space-r_space))<0.1 and l_space>0.2:
  447. return "title_2"
  448. @staticmethod
  449. def is_first_title(_title):
  450. if _title is None:
  451. return False
  452. if re.search("^\d+$",_title) is not None:
  453. if int(_title)==1:
  454. return True
  455. return False
  456. if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
  457. if _title=="一":
  458. return True
  459. return False
  460. if re.search("^[a-z]+$",_title) is not None:
  461. if _title=="a":
  462. return True
  463. return False
  464. if re.search("^[A-Z]+$",_title) is not None:
  465. if _title=="A":
  466. return True
  467. return False
  468. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
  469. if _title=="Ⅰ":
  470. return True
  471. return False
  472. return False
  473. @staticmethod
  474. def get_next_title(_title):
  475. if re.search("^\d+$",_title) is not None:
  476. return str(int(_title)+1)
  477. if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
  478. _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
  479. _next_title = list(_next_title)
  480. _next_title.reverse()
  481. if _next_title[-1]!="十":
  482. if len(_next_title)>=2:
  483. _next_title.insert(-1,'十')
  484. if len(_next_title)>=4:
  485. _next_title.insert(-3,'百')
  486. if _title[0]=="十":
  487. if _next_title=="十":
  488. _next_title = ["二","十"]
  489. _next_title.insert(0,"十")
  490. _next_title = "".join(_next_title)
  491. return _next_title
  492. if re.search("^[a-z]+$",_title) is not None:
  493. _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
  494. _next_title = list(_next_title)
  495. _next_title.reverse()
  496. return "".join(_next_title)
  497. if re.search("^[A-Z]+$",_title) is not None:
  498. _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
  499. _next_title = list(_next_title)
  500. _next_title.reverse()
  501. return "".join(_next_title)
  502. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
  503. _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
  504. _index = _sort.index(_title)
  505. if _index<len(_sort)-1:
  506. return _sort[_index+1]
  507. return None
  508. @staticmethod
  509. def make_increase(_sort,_title,_add=1):
  510. if len(_title)==0 and _add==0:
  511. return ""
  512. if len(_title)==0 and _add==1:
  513. return _sort[0]
  514. _index = _sort.index(_title[-1])
  515. next_index = (_index+_add)%len(_sort)
  516. next_chr = _sort[next_index]
  517. if _index==len(_sort)-1:
  518. _add = 1
  519. else:
  520. _add = 0
  521. return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
  522. @staticmethod
  523. def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
  524. "(?P<title_2>^\d+[\.、\s])|" \
  525. "(?P<title_3>^\d+\.\d+[\.、\s])|" \
  526. "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
  527. "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
  528. #todo :recog the serial of the sentence
  529. _se = re.search(_pattern,_text)
  530. if _se is not None:
  531. _gd = _se.groupdict()
  532. for k,v in _gd.items():
  533. if v is not None:
  534. return k
  535. return None
  536. if __name__ == '__main__':
  537. document = ParseDocument('file/1623230459239.pdf')
  538. # import pdfplumber
  539. # import re
  540. #
  541. # path = '关于将朝阳区建设为全球一流中心城区的课题研究.pdf'
  542. # pdf = pdfplumber.open(path)
  543. #
  544. # _index = 0
  545. # for page in pdf.pages:
  546. # _index += 1
  547. # # print(page.extract_text())
  548. # if _index==10:
  549. # page.extract_tables()
  550. # # print(page.edges)
  551. # else:
  552. # continue
  553. #
  554. # for pdf_table in page.extract_tables():
  555. # table = []
  556. # cells = []
  557. # for row in pdf_table:
  558. # if not any(row):
  559. # # 如果一行全为空,则视为一条记录结束
  560. # if any(cells):
  561. # table.append(cells)
  562. # cells = []
  563. # elif all(row):
  564. # # 如果一行全不为空,则本条为新行,上一条结束
  565. # if any(cells):
  566. # table.append(cells)
  567. # cells = []
  568. # table.append(row)
  569. # else:
  570. # if len(cells) == 0:
  571. # cells = row
  572. # else:
  573. # for i in range(len(row)):
  574. # if row[i] is not None:
  575. # cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
  576. # for row in table:
  577. # print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
  578. # print('---------- 分割线 ----------')
  579. #
  580. # pdf.close()