html_2_kvtree.py 74 KB


  1. #coding:utf8
  2. from bs4 import BeautifulSoup
  3. import json
  4. import re
  5. import traceback
  6. import logging
  7. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  8. logger = logging.getLogger(__name__)
  9. logger.setLevel(logging.INFO)
  10. from BiddingKG.dl.interface.Preprocessing import tableToText
  11. from uuid import uuid4
  12. def log(msg):
  13. '''
  14. @summary:打印信息
  15. '''
  16. logger.info(msg)
  17. class DotDict(dict):
  18. def __getattr__(self,name):
  19. try:
  20. return self[name]
  21. except KeyError:
  22. raise AttributeError("No attribute '%s'" % name)
  23. def __setattr__(self,name,value):
  24. self[name] = value
def get_tables(soup,dict_table = None):
    """Collect the table elements found under *soup* into a nested dict tree.

    Each node is a dict with a "children" list and, when the element is a
    <table>/<tbody> that directly owns <tr> rows, a "table" entry holding
    that element. The outermost call (``dict_table`` omitted) finishes by
    running ``squeeze_tables`` on the result, so it may return ``None``
    when nothing qualifies.

    NOTE(review): nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the original file.
    NOTE(review): relies on the module-level ``import copy`` that appears
    further down in this file.
    """
    is_first = False
    if dict_table is None:
        dict_table = {"children":[]}
        is_first = True
    if soup and soup.name:
        childs = soup.contents
    else:
        childs = []
    # Case "tr + tbody": a leading stray <tr> followed by a <tbody> is moved
    # into that <tbody> as its first row.
    _flag = False
    if len(childs)>=2:
        if childs[0].name=="tr" and childs[1].name=="tbody":
            childs[1].insert(0,copy.copy(childs[0]))
            childs[0].decompose()
            _flag = True
    childs_bak = childs
    # Case "tbody + tbody": when the first <tbody> has exactly one row with
    # cells, that row is moved to the front of the second <tbody>.
    # NOTE(review): this reset discards the flag set by the tr+tbody case.
    _flag = False
    if soup and soup.name:
        childs = soup.find_all("tbody",recursive=False)
    if len(childs)>=2:
        if childs[0].name=="tbody" and childs[1].name=="tbody":
            child0_tr = childs[0].find_all("tr",recursive=False)
            has_td_count = 0
            tr_line = None
            for tr in child0_tr:
                if len(tr.find_all("td",recursive=False))>0:
                    has_td_count += 1
                    tr_line = tr
            if has_td_count==1:
                childs[1].insert(0,copy.copy(tr_line))
                childs[0].decompose()
                _flag = True
    # Walk the full child list again (not just the <tbody> subset).
    childs = childs_bak
    for child in childs:
        _d = {"children":[]}
        if child.name in ("table","tbody"):
            if len(child.find_all("tr",recursive=False))>0:
                # _d["table"] = str(child)
                _d["table"] = child
        dict_table["children"].append(_d)
        child_dict_table = get_tables(child,_d)
    if is_first:
        # The root element itself counts as a table only when no rows were
        # relocated above and it directly owns <tr> rows.
        if soup.name in ("table","tbody"):
            if not _flag:
                if len(soup.find_all("tr",recursive=False))>0:
                    # dict_table["table"] = str(soup)
                    dict_table["table"] = soup
        dict_table = squeeze_tables(dict_table)
    return dict_table
  76. def squeeze_tables(dict_table):
  77. _i = -1
  78. new_children = []
  79. for child in dict_table["children"]:
  80. _i += 1
  81. child_table = squeeze_tables(child)
  82. if child_table is not None:
  83. new_children.append(child_table)
  84. if dict_table.get("table") is not None:
  85. if len(new_children)>0:
  86. dict_table["children"] = new_children
  87. else:
  88. del dict_table["children"]
  89. return dict_table
  90. if len(new_children)==1:
  91. return new_children[0]
  92. if len(new_children)>1:
  93. dict_table["children"] = new_children
  94. return dict_table
  95. return None
  96. def table_to_tree(soup,json_obj=None):
  97. if json_obj is None:
  98. json_obj = DotDict({"tag": "table","children":[]})
  99. dict_table = get_tables(soup)
  100. children = dict_table.get("children",[])
  101. for child in children:
  102. _d = DotDict({"tag": "table","children":[]})
  103. json_obj["children"].append(_d)
  104. table = child.get("table")
  105. if table is not None:
  106. table_id = str(uuid4())
  107. table_to_tree(table,_d)
  108. table = dict_table.get("table")
  109. if table is not None:
  110. table_id = str(uuid4())
  111. json_obj["table_id"] = table_id
  112. soup, kv_list, text = tableToText(table,return_kv=True)
  113. _flag = False
  114. if soup and soup.name:
  115. if soup.contents:
  116. _flag = True
  117. soup.contents[0].insert_before(table_id)
  118. if not _flag:
  119. soup.insert_before(table_id)
  120. json_obj["text"] = text
  121. json_obj["kv"] = kv_list
  122. for _d in kv_list:
  123. _d["position"] = {"key_begin_sentence":0,
  124. "key_begin_sentence_start":_d.get("key_sen_index",0),
  125. "key_end_sentence":0,
  126. "key_end_sentence_end":_d.get("key_sen_index",0)+len(_d.get("key","")),
  127. "value_begin_sentence":0,
  128. "value_begin_sentence_start":_d.get("value_sen_index",0),
  129. "value_end_sentence":0,
  130. "value_end_sentence_end":_d.get("value_sen_index",0)+len(_d.get("value",""))
  131. }
  132. if "key_sen_index" in _d:
  133. _d.pop("key_sen_index")
  134. if "value_sen_index" in _d:
  135. _d.pop("value_sen_index")
  136. return json_obj
  137. def update_table_position(table,sentence_index):
  138. def get_table_idx_lengths(list_table_id,index):
  139. _length = 0
  140. for _d in list_table_id:
  141. table_id = _d.get("table_id")
  142. idx = _d.get("idx",-1)
  143. if idx>=0 and _idx<=index:
  144. _length += len(table_id)
  145. return _length
  146. def get_sentence_index(list_sent_span,idx):
  147. list_sent_span.sort(key=lambda x:x[0])
  148. for _i in range(len(list_sent_span)):
  149. if list_sent_span[_i][0]<=idx and idx<=list_sent_span[_i][1]:
  150. return _i
  151. return 0
  152. def get_list_tables(table,list_table=[]):
  153. table_id = table.get("table_id")
  154. if table_id:
  155. list_table.append(table)
  156. childs = table.get("children",[])
  157. for child in childs:
  158. get_list_tables(child,list_table)
  159. return list_table
  160. tables = get_list_tables(table)
  161. if tables:
  162. list_table_id = []
  163. text = tables[0].get("text","")
  164. for table in tables:
  165. table_id = table.get("table_id")
  166. if table_id:
  167. _idx = text.find(table_id)
  168. list_table_id.append({"table_id":table_id,"idx":_idx})
  169. if _idx>=0:
  170. kv_list = table.get("kv",[])
  171. for _d in kv_list:
  172. _d["position"]["key_begin_sentence_start"] += _idx
  173. _d["position"]["key_end_sentence_end"] += _idx
  174. _d["position"]["value_begin_sentence_start"] += _idx
  175. _d["position"]["value_end_sentence_end"] += _idx
  176. # remove table_id
  177. for table in tables:
  178. table_id = table.get("table_id")
  179. if table_id:
  180. kv_list = table.get("kv",[])
  181. for _d in kv_list:
  182. _length = get_table_idx_lengths(list_table_id,_d["position"]["key_begin_sentence_start"])
  183. _d["position"]["key_begin_sentence_start"] -= _length
  184. _length = get_table_idx_lengths(list_table_id,_d["position"]["key_end_sentence_end"])
  185. _d["position"]["key_end_sentence_end"] -= _length
  186. _length = get_table_idx_lengths(list_table_id,_d["position"]["value_begin_sentence_start"])
  187. _d["position"]["value_begin_sentence_start"] -= _length
  188. _length = get_table_idx_lengths(list_table_id,_d["position"]["value_end_sentence_end"])
  189. _d["position"]["value_end_sentence_end"] -= _length
  190. for table in tables:
  191. if table.get("table_id"):
  192. text = table.get("text","")
  193. for _d in list_table_id:
  194. table_id = _d.get("table_id")
  195. text = text.replace(table_id,"")
  196. table["text"] = text
  197. # split sentence
  198. text = tables[0].get("text","")
  199. list_sentence = str(text).split("。")
  200. list_sent_span = []
  201. _begin = 0
  202. for _i in range(len(list_sentence)):
  203. list_sentence[_i] += "。"
  204. _end = _begin+len(list_sentence[_i])
  205. list_sent_span.append([_begin,_end])
  206. _begin = _end
  207. tables[0]["sentences"] = list_sentence
  208. for table in tables:
  209. kv_list = table.get("kv",[])
  210. for _d in kv_list:
  211. key_begin_sentence = get_sentence_index(list_sent_span,_d["position"]["key_begin_sentence_start"])
  212. _d["position"]["key_begin_sentence"] = key_begin_sentence+sentence_index
  213. key_end_sentence = get_sentence_index(list_sent_span,_d["position"]["key_end_sentence_end"])
  214. _d["position"]["key_end_sentence"] = key_end_sentence+sentence_index
  215. value_begin_sentence = get_sentence_index(list_sent_span,_d["position"]["value_begin_sentence_start"])
  216. _d["position"]["value_begin_sentence"] = value_begin_sentence+sentence_index
  217. value_end_sentence = get_sentence_index(list_sent_span,_d["position"]["value_end_sentence_end"])
  218. _d["position"]["value_end_sentence"] = value_end_sentence+sentence_index
  219. return sentence_index + len(list_sentence)
  220. return sentence_index
  221. def tree_reposition(tree,sentence_index=None):
  222. if sentence_index is None:
  223. sentence_index = 0
  224. wordOffset_begin = 0
  225. wordOffset_end = 0
  226. for obj in tree:
  227. is_table = True if obj.get("tag","")=="table" else False
  228. if not is_table:
  229. sentence_index += 1
  230. obj["sentence_index"] = sentence_index
  231. obj["sentences"] = [obj.get("text","")]
  232. for _t in obj["sentences"]:
  233. wordOffset_end += len(_t)
  234. obj["wordOffset_begin"] = wordOffset_begin
  235. obj["wordOffset_end"] = wordOffset_end
  236. wordOffset_begin = wordOffset_end
  237. else:
  238. sentence_index += 1
  239. obj["sentence_index"] = sentence_index
  240. obj["sentence_index_start"] = sentence_index
  241. obj["sentences"] = [obj.get("text","")]
  242. sentence_index_end = update_table_position(obj,sentence_index)
  243. obj["sentence_index_end"] = sentence_index_end
  244. sentence_index = sentence_index_end
  245. for _t in obj["sentences"]:
  246. wordOffset_end += len(_t)
  247. obj["wordOffset_begin"] = wordOffset_begin
  248. obj["wordOffset_end"] = wordOffset_end
  249. wordOffset_begin = wordOffset_end
  250. # 递归地将 DOM 转换为 JSON
  251. def dom_to_tree(node):
  252. if node.name: # 如果是标签节点
  253. json_obj = DotDict({"tag": node.name})
  254. if node.attrs:
  255. json_obj["attributes"] = node.attrs
  256. is_table = False
  257. if node.name in ("table","tbody"):
  258. json_obj = table_to_tree(node)
  259. is_table = True
  260. if not is_table:
  261. children = []
  262. for child in node.contents:
  263. _child = dom_to_tree(child)
  264. if _child is not None:
  265. children.append(_child)
  266. if children:
  267. json_obj["children"] = children
  268. json_obj["name"] = json_obj.get("tag")
  269. return json_obj
  270. elif node.string and node.string.strip(): # 如果是纯文本节点
  271. _text = node.string.strip()
  272. _text = re.sub('\xa0','',_text)
  273. list_text = re.split("\s",_text)
  274. _text = ""
  275. for _t in list_text:
  276. if len(_t)<3:
  277. if len(_t)>0:
  278. _text += _t
  279. else:
  280. _text += _t+" "
  281. _text = _text.strip()
  282. return DotDict({"tag":"text","name":"text","text": _text})
  283. return None # 忽略空白字符
  284. def tree_pop_parent(tree):
  285. if isinstance(tree,list):
  286. for child in tree:
  287. tree_pop_parent(child)
  288. if isinstance(tree,dict):
  289. if "parent" in tree:
  290. del tree["parent"]
  291. for child in tree.get("children",[]):
  292. tree_pop_parent(child)
  293. def html_to_tree(html_content):
  294. # 使用 BeautifulSoup 解析 HTML
  295. soup = BeautifulSoup(html_content, "lxml")
  296. dom_tree = dom_to_tree(soup)
  297. extract_kv_from_tree(dom_tree)
  298. list_objs = get_outobjs_from_tree(dom_tree)
  299. tree_reposition(list_objs)
  300. return dom_tree
  301. def print_tree(dom_tree):
  302. # 转换为 JSON 格式
  303. tree_pop_parent(dom_tree)
  304. json_output = json.dumps(dom_tree,ensure_ascii=False, indent=2)
# Earlier, broader pattern kept for reference:
# kv_pattern = "\s*(?P<key>.{,10})[::]\s*(?P<value>[^::。,()]+?)(\s+|$|;|;)(?![\u4e00-\u9fa5]+:)"
# Key/value pattern used by extract_kv_from_sentence: the key is a run of
# characters in \u4e00-\u9fa5 directly before a colon; the value runs up to
# the next whitespace or one of the listed delimiters.
kv_pattern = r"(?P<key>[\u4e00-\u9fa5]+):\s*(?P<value>[^\s,。();;]+)"
  307. def get_kv_pattern():
  308. import re
  309. text = """
  310. name: John age: 30 note: invalid;
  311. """
  312. # 正则模式
  313. kv_pattern = r"(?P<key>[a-zA-Z]+)[::](?P<value>.+(?!.*[::]))"
  314. # 提取匹配
  315. matches = re.findall(kv_pattern, text)
  316. # 打印结果
  317. for match in matches:
  318. key, value = match
  319. print("{%s}: {%s}"%(key,value))
  320. def extract_kv_from_sentence(sentence):
  321. list_kv = []
  322. _iter = re.finditer("[::]", sentence)
  323. if _iter:
  324. list_span = []
  325. for iter in _iter:
  326. list_span.append(iter.span())
  327. if len(list_span)==1:
  328. _begin,_end = list_span[0]
  329. if _begin<20 and _end<len(sentence)-1:
  330. _d = DotDict({"key":sentence[0:_begin],"value":sentence[_end:]})
  331. _d["position"] = {"key_begin_sentence":0,
  332. "key_begin_sentence_start":0,
  333. "key_end_sentence":0,
  334. "key_end_sentence_end":_begin,
  335. "value_begin_sentence":0,
  336. "value_begin_sentence_start":_end,
  337. "value_end_sentence":0,
  338. "value_end_sentence_end":len(sentence)
  339. }
  340. list_kv.append(_d)
  341. else:
  342. _begin = 0
  343. _end = len(sentence)-1
  344. iter = re.search(kv_pattern,sentence[_begin:_end])
  345. if iter is not None:
  346. _d = DotDict({})
  347. _d["key"] = iter.group("key")
  348. _d["value"] = iter.group("value")
  349. _d["position"] = {"key_begin_sentence":0,
  350. "key_begin_sentence_start":iter.span("key")[0],
  351. "key_end_sentence":0,
  352. "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
  353. "value_begin_sentence":0,
  354. "value_begin_sentence_start":iter.span("value")[0],
  355. "value_end_sentence":0,
  356. "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
  357. }
  358. list_kv.append(_d)
  359. elif len(list_span)>1:
  360. _begin,_end = list_span[0]
  361. if _begin<20 and len(sentence)>100:
  362. _d = DotDict({"key":sentence[0:_begin],"value":sentence[_end:]})
  363. _d["position"] = {"key_begin_sentence":0,
  364. "key_begin_sentence_start":0,
  365. "key_end_sentence":0,
  366. "key_end_sentence_end":_begin,
  367. "value_begin_sentence":0,
  368. "value_begin_sentence_start":_end,
  369. "value_end_sentence":0,
  370. "value_end_sentence_end":len(sentence)
  371. }
  372. list_kv.append(_d)
  373. else:
  374. _begin = 0
  375. for _i in range(len(list_span)-1):
  376. _end = list_span[_i+1][0]
  377. iter = re.search(kv_pattern,sentence[_begin:_end])
  378. _begin = list_span[_i][1]
  379. if iter is not None:
  380. _d = DotDict({})
  381. _d["key"] = iter.group("key")
  382. _d["value"] = iter.group("value")
  383. _d["position"] = {"key_begin_sentence":0,
  384. "key_begin_sentence_start":iter.span("key")[0],
  385. "key_end_sentence":0,
  386. "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
  387. "value_begin_sentence":0,
  388. "value_begin_sentence_start":iter.span("value")[0],
  389. "value_end_sentence":0,
  390. "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
  391. }
  392. list_kv.append(_d)
  393. _begin = list_span[-2][1]
  394. _end = len(sentence)
  395. iter = re.search(kv_pattern,sentence[_begin:_end])
  396. if iter is not None:
  397. _d = DotDict({})
  398. _d["key"] = iter.group("key")
  399. _d["value"] = iter.group("value")
  400. _d["position"] = {"key_begin_sentence":0,
  401. "key_begin_sentence_start":iter.span("key")[0],
  402. "key_end_sentence":0,
  403. "key_end_sentence_end":iter.span("key")[0]+len(_d.get("key","")),
  404. "value_begin_sentence":0,
  405. "value_begin_sentence_start":iter.span("value")[0],
  406. "value_end_sentence":0,
  407. "value_end_sentence_end":iter.span("value")[0]+len(_d.get("value",""))
  408. }
  409. list_kv.append(_d)
  410. # for iter in _iter:
  411. # _d = DotDict({})
  412. # _d["key"] = iter.group("key")
  413. # _d["value"] = iter.group("value")
  414. # _d["key_span"] = iter.span("key")
  415. # _d["value_span"] = iter.span("value")
  416. # list_kv.append(_d)
  417. return list_kv
  418. def extract_kv_from_node(node):
  419. list_kv = []
  420. list_text = []
  421. childs = node.get("children",[])
  422. _text = ""
  423. has_br = False
  424. if childs:
  425. for child in childs:
  426. node_name = child.get("tag","")
  427. child_text = child.get("text")
  428. if node_name=="br":
  429. list_text.append([])
  430. has_br = True
  431. if child_text:
  432. if len(list_text)==0:
  433. list_text.append([])
  434. list_text[-1].append(child)
  435. node["kv"] = []
  436. if has_br:
  437. new_children = []
  438. for texts in list_text:
  439. if texts:
  440. _text = "".join([a.get("text") for a in texts])
  441. tag = texts[0]
  442. list_kv = extract_kv_from_sentence(_text)
  443. _n = DotDict({"tag":tag,"name":tag,"text":_text,"children":[],"kv":list_kv})
  444. new_children.append(_n)
  445. node["children"] = new_children
  446. else:
  447. for texts in list_text:
  448. _text = "".join([a.get("text") for a in texts])
  449. if _text:
  450. list_kv = extract_kv_from_sentence(_text)
  451. node["kv"].extend(list_kv)
  452. else:
  453. _text = node.get("text")
  454. if _text:
  455. list_kv = extract_kv_from_sentence(_text)
  456. node["kv"] = list_kv
  457. return list_kv
  458. def get_child_text(node):
  459. _text = node.get("text","")
  460. for child in node.get("children",[]):
  461. _text += get_child_text(child)
  462. return _text
def extract_kv_from_tree(tree):
    """Walk the converted tree bottom-up and attach key/value pairs to nodes.

    @param tree: a node dict or a list of node dicts.
    @return: ``(count, has_table)`` -- the kv count reported for the
        subtree and whether a node tagged "table" was seen in it.

    NOTE(review): nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the original file.
    """
    if isinstance(tree,list):
        # Lists just aggregate the results of their elements.
        _count = 0
        has_table = False
        for child in tree:
            _c,_t = extract_kv_from_tree(child)
            _count += _c
            if _t:
                has_table = _t
        return _count,has_table
    if isinstance(tree,dict):
        if tree.get("tag","")!="table":
            childs = tree.get("children",[])
            if len(childs)>0:
                _count = 0
                has_table = False
                child_has_p_div = False
                child_has_br = False
                for child in childs:
                    _c,_t = extract_kv_from_tree(child)
                    _count += _c
                    if _t:
                        has_table = _t
                    if child.get("tag","") in ("p","div"):
                        child_has_p_div = True
                    if child.get("tag","")=="br":
                        child_has_br = True
                if _count==0:
                    # No kv below: when there is no table, p/div or br child,
                    # flatten the subtree into this node's text first.
                    if not has_table and not child_has_p_div and not child_has_br:
                        _text = get_child_text(tree)
                        if "children" in tree:
                            del tree["children"]
                        tree["text"] = _text
                    list_kv = extract_kv_from_node(tree)
                    _count = len(list_kv)
                    return _count,has_table
                if tree.get("tag","") in ("p","div") and not has_table and not child_has_p_div:
                    if not child_has_br:
                        # Retry on the whole paragraph text; keep whichever
                        # level yields at least as many kv.
                        _text = get_child_text(tree)
                        tree["text"] = _text
                        p_list_kv = extract_kv_from_node(tree)
                        if len(p_list_kv)>=_count:
                            if "children" in tree:
                                del tree["children"]
                        else:
                            tree["text"] = ""
                        return len(p_list_kv),has_table
                return _count,has_table
            else:
                # Leaf node: extract from its own text.
                list_kv = extract_kv_from_node(tree)
                return len(list_kv),False
        else:
            # Table nodes already carry their kv list.
            return len(tree.get("kv",[])),True
    return 0,False
  517. def update_kv_span(list_kv,append_length):
  518. for _d in list_kv:
  519. _d["position"] = {"key_begin_sentence":0,
  520. "key_begin_sentence_start":_d.get("key_sen_index",0),
  521. "key_end_sentence":0,
  522. "key_end_sentence_end":_d.get("key_sen_index",0)+len(_d.get("key","")),
  523. "value_begin_sentence":0,
  524. "value_begin_sentence_start":_d.get("value_sen_index",0),
  525. "value_end_sentence":0,
  526. "value_end_sentence_end":_d.get("value_sen_index",0)+len(_d.get("value",""))
  527. }
  528. _d["position"]["key_begin_sentence_start"] += append_length
  529. _d["position"]["key_end_sentence_end"] += append_length
  530. _d["position"]["value_begin_sentence_start"] += append_length
  531. _d["position"]["value_end_sentence_end"] += append_length
  532. def get_outobjs_from_tree(tree,list_outobjs=None):
  533. is_first = False
  534. if list_outobjs is None:
  535. list_outobjs = []
  536. is_first = True
  537. if isinstance(tree,list):
  538. for child in tree:
  539. get_outobjs_from_tree(child,list_outobjs)
  540. if isinstance(tree,dict):
  541. childs = tree.get("children",[])
  542. _text = tree.get("text","")
  543. is_table = True if tree.get("tag","")=="table" else False
  544. if is_table:
  545. list_outobjs.append(tree)
  546. else:
  547. if _text!="":
  548. tree.name = tree.tag
  549. list_outobjs.append(tree)
  550. for child in childs:
  551. get_outobjs_from_tree(child,list_outobjs)
  552. return list_outobjs
  553. def standard_title_context(_title_context):
  554. return _title_context.replace("(","(").replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".").replace(".",".")
  555. def standard_product(sentence):
  556. return sentence.replace("(","(").replace(")",")")
  557. import Levenshtein
  558. import copy
  559. def jaccard_score(source,target):
  560. source_set = set([s for s in source])
  561. target_set = set([s for s in target])
  562. if len(source_set)==0 or len(target_set)==0:
  563. return 0
  564. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  565. def judge_pur_chinese(keyword):
  566. """
  567. 中文字符的编码范围为: u'\u4e00' -- u'\u9fff:只要在此范围内就可以判断为中文字符串
  568. @param keyword:
  569. @return:
  570. """
  571. # 定义一个需要删除的标点符号字符串列表
  572. remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
  573. # 利用re.sub来删除中文字符串中的标点符号
  574. strings = re.sub(remove_chars, "", keyword) # 将keyword中文字符串中remove_chars中包含的标点符号替换为空字符串
  575. for ch in strings:
  576. if u'\u4e00' <= ch <= u'\u9fff':
  577. pass
  578. else:
  579. return False
  580. return True
  581. def is_similar(source,target,_radio=None):
  582. source = str(source).lower()
  583. target = str(target).lower()
  584. max_len = max(len(source),len(target))
  585. min_len = min(len(source),len(target))
  586. min_ratio = 90
  587. if min_len>=3:
  588. min_ratio = 87
  589. if min_len>=5:
  590. min_ratio = 85
  591. if _radio is not None:
  592. min_ratio = _radio
  593. # dis_len = abs(len(source)-len(target))
  594. # min_dis = min(max_len*0.2,4)
  595. if min_len==0 and max_len>0:
  596. return False
  597. if max_len<=2:
  598. if source==target:
  599. return True
  600. if min_len<2:
  601. return False
  602. #判断相似度
  603. similar = Levenshtein.ratio(source,target)*100
  604. if similar>=min_ratio:
  605. log("%s and %s similar_jaro %d"%(source,target,similar))
  606. return True
  607. similar_jaro = Levenshtein.jaro(source,target)
  608. if similar_jaro*100>=min_ratio:
  609. log("%s and %s similar_jaro %d"%(source,target,similar_jaro*100))
  610. return True
  611. similar_jarow = Levenshtein.jaro_winkler(source,target)
  612. if similar_jarow*100>=min_ratio:
  613. log("%s and %s similar_jaro %d"%(source,target,similar_jarow*100))
  614. return True
  615. if min_len>=5:
  616. if len(source)==max_len and str(source).find(target)>=0:
  617. return True
  618. elif len(target)==max_len and target.find(source)>=0:
  619. return True
  620. elif jaccard_score(source, target)==1 and judge_pur_chinese(source) and judge_pur_chinese(target):
  621. return True
  622. return False
# Module-level regex sources (used with re.search elsewhere in this file).
# Alternation of section headings (the last alternative is listed twice).
end_pattern = "商务要求|评分标准|商务条件|商务条件"
# Headings that pair a subject word with a requirement/parameter word.
_param_pattern = "(产品|技术|清单|配置|参数|具体|明细|项目|招标|货物|服务|规格|工作|具体)[及和与]?(指标|配置|条件|要求|参数|需求|规格|条款|名称及要求)|配置清单|(质量|技术).{,10}要求|验收标准|^(参数|功能)$"
# Numbers with units, comparison signs and technical attribute words.
meter_pattern = "[><≤≥±]\d+|\d+(?:[μucmkK微毫千]?[米升LlgGmMΩ]|摄氏度|英寸|度|天|VA|dB|bpm|rpm|kPa|mol|cmH20|%|°|Mpa|Hz|K?HZ|℃|W|min|[*×xX])|[*×xX]\d+|/min|\ds[^a-zA-Z]|GB.{,20}标准|PVC|PP|角度|容积|色彩|自动|流量|外径|轴位|折射率|帧率|柱镜|振幅|磁场|镜片|防漏|强度|允差|心率|倍数|瞳距|底座|色泽|噪音|间距|材质|材料|表面|频率|阻抗|浓度|兼容|防尘|防水|内径|实时|一次性|误差|性能|距离|精确|温度|超温|范围|跟踪|对比度|亮度|[横纵]向|均压|负压|正压|可调|设定值|功能|检测|高度|厚度|宽度|深度|[单双多]通道|效果|指数|模式|尺寸|重量|峰值|谷值|容量|寿命|稳定性|高温|信号|电源|电流|转换率|效率|释放量|转速|离心力|向心力|弯曲|电压|功率|气量|国标|标准协议|灵敏度|最大值|最小值|耐磨|波形|高压|性强|工艺|光源|低压|压力|压强|速度|湿度|重量|毛重|[MLX大中小]+码|净重|颜色|[红橙黄绿青蓝紫]色|不锈钢|输入|输出|噪声|认证|配置"
# Commercial / pricing / scoring vocabulary (presumably the counterpart of
# meter_pattern; its use is not visible in this part of the file).
not_meter_pattern = "投标报价|中标金额|商务部分|公章|分值构成|业绩|详见|联系人|联系电话|合同价|金额|采购预算|资金来源|费用|质疑|评审因素|评审标准|商务资信|商务评分|专家论证意见|评标方法|代理服务费|售后服务|评分类型|评分项目|预算金额|得\d+分|项目金额|详见招标文件|乙方"
  627. def getTrs(tbody):
  628. #获取所有的tr
  629. trs = []
  630. if tbody.name=="table":
  631. body = tbody.find("tbody",recursive=False)
  632. if body is not None:
  633. tbody = body
  634. objs = tbody.find_all(recursive=False)
  635. for obj in objs:
  636. if obj.name=="tr":
  637. trs.append(obj)
  638. if obj.name=="tbody" or obj.name=="table":
  639. for tr in obj.find_all("tr",recursive=False):
  640. trs.append(tr)
  641. return trs
def fixSpan(tbody):
    """Expand colspan/rowspan cells in place by duplicating the spanning cell.

    Columns are expanded first for every row, then rows; rows that
    contain a nested table are skipped in both passes.

    NOTE(review): nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the original file.
    NOTE(review): relies on the module-level ``import copy``.
    """
    # Fill in the cells implied by colspan / rowspan.
    #trs = tbody.findChildren('tr', recursive=False)
    trs = getTrs(tbody)
    ths_len = 0
    ths = list()
    trs_set = set()
    # Columns are completed before rows; the other order could scramble
    # the parsed table.
    # Pass 1: iterate over every tr.
    for indtr, tr in enumerate(trs):
        ths_tmp = tr.findChildren('th', recursive=False)
        # Do not expand rows that contain a table.
        if len(tr.findChildren('table'))>0:
            continue
        if len(ths_tmp) > 0:
            ths_len = ths_len + len(ths_tmp)
            for th in ths_tmp:
                ths.append(th)
            trs_set.add(tr)
        # Iterate over the elements of the row.
        tds = tr.findChildren(recursive=False)
        for indtd, td in enumerate(tds):
            # A colspan cell is copied into the following positions of the same row.
            if 'colspan' in td.attrs:
                if str(re.sub("[^0-9]","",str(td['colspan'])))!="":
                    col = int(re.sub("[^0-9]","",str(td['colspan'])))
                    # Very large spans or very long cells are left untouched.
                    if col<100 and len(td.get_text())<1000:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
    # Pass 2: rowspan.
    for indtr, tr in enumerate(trs):
        ths_tmp = tr.findChildren('th', recursive=False)
        # Do not expand rows that contain a table.
        if len(tr.findChildren('table'))>0:
            continue
        if len(ths_tmp) > 0:
            ths_len = ths_len + len(ths_tmp)
            for th in ths_tmp:
                ths.append(th)
            trs_set.add(tr)
        # Iterate over the elements of the row.
        tds = tr.findChildren(recursive=False)
        for indtd, td in enumerate(tds):
            # A rowspan cell is copied into the same position of the following rows.
            if 'rowspan' in td.attrs:
                if str(re.sub("[^0-9]","",str(td['rowspan'])))!="":
                    row = int(re.sub("[^0-9]","",str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # Fetch the cells of the target row and insert at the matching position.
                        if indtr+i<len(trs):
                            tds1 = trs[indtr + i].findChildren(['td','th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1)>0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            elif indtd-2>0 and len(tds1) > 0 and len(tds1) == indtd - 1:  # fix: last column not filled in for some tables
                                tds1[indtd-2].insert_after(copy.copy(td))
  701. def getTable(tbody):
  702. #trs = tbody.findChildren('tr', recursive=False)
  703. fixSpan(tbody)
  704. trs = getTrs(tbody)
  705. inner_table = []
  706. for tr in trs:
  707. tr_line = []
  708. tds = tr.findChildren(['td','th'], recursive=False)
  709. if len(tds)==0:
  710. tr_line.append([re.sub('\xa0','',tr.get_text()),0]) # 2021/12/21 修复部分表格没有td 造成数据丢失
  711. for td in tds:
  712. tr_line.append([re.sub('\xa0','',td.get_text()),0])
  713. #tr_line.append([td.get_text(),0])
  714. inner_table.append(tr_line)
  715. return inner_table
def extract_products(list_data,_product,_param_pattern = "产品名称|设备材料|采购内存|标的名称|采购内容|(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体|标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)[\))的]?([、\w]{,4}名称|内容|描述)|标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$"):
    """Collect candidate product names from the table entries of *list_data*.

    @param list_data: sequence of dicts with "type", "text" and, for
        tables, "list_table" (rows of ``[text, ...]`` cells).
    @param _product: reference product name (normalised via standard_product).
    @param _param_pattern: regex identifying product-name header cells.
    @return: de-duplicated list of product name strings.

    NOTE(review): nesting below was reconstructed from a listing that had
    lost its indentation -- confirm against the original file.
    """
    _product = standard_product(_product)
    list_result = []
    list_table_products = []
    for _data_i in range(len(list_data)):
        _data = list_data[_data_i]
        _type = _data["type"]
        _text = _data["text"]
        if _type=="table":
            list_table = _data["list_table"]
            if list_table is None:
                continue
            _check = True
            # Skip ragged tables: shortest row under half the longest.
            max_length = max([len(a) for a in list_table])
            min_length = min([len(a) for a in list_table])
            if min_length<max_length/2:
                continue
            list_head_index = []
            _begin_index = 0
            head_cell_text = ""
            # Look for product-name header cells in the first two rows; the
            # row must also mention a price/quantity/spec style column.
            for line_i in range(len(list_table[:2])):
                line = list_table[line_i]
                line_text = ",".join([cell[0] for cell in line])
                for cell_i in range(len(line)):
                    cell = line[cell_i]
                    cell_text = cell[0]
                    if len(cell_text)<10 and re.search(_param_pattern,cell_text) is not None and re.search("单价|数量|预算|限价|总价|品牌|规格|型号|用途|要求|采购量",line_text) is not None:
                        _begin_index = line_i+1
                        list_head_index.append(cell_i)
            # Concatenate the text of the candidate columns.
            for line_i in range(len(list_table)):
                line = list_table[line_i]
                for cell_i in list_head_index:
                    if cell_i>=len(line):
                        continue
                    cell = line[cell_i]
                    cell_text = cell[0]
                    head_cell_text += cell_text
            # print("===head_cell_text",head_cell_text)
            # Discard the candidates when the column text matches the
            # exclusion pattern below.
            if re.search("招标人|采购人|项目编号|项目名称|金额|^\d+$",head_cell_text) is not None:
                list_head_index = []
            # Also accept any column with a cell containing the reference product.
            for line in list_table:
                line_text = ",".join([cell[0] for cell in line])
                for cell_i in range(len(line)):
                    cell = line[cell_i]
                    cell_text = cell[0]
                    if cell_text is not None and _product is not None and len(cell_text)<len(_product)*10 and cell_text.find(_product)>=0 and re.search("单价|数量|总价|规格|品牌|型号|用途|要求|采购量",line_text) is not None:
                        list_head_index.append(cell_i)
            list_head_index = list(set(list_head_index))
            if len(list_head_index)>0:
                has_number = False
                for cell_i in list_head_index:
                    table_products = []
                    for line_i in range(_begin_index,len(list_table)):
                        line = list_table[line_i]
                        for _i in range(len(line)):
                            cell = line[_i]
                            cell_text = cell[0]
                            if re.search("^\d+$",cell_text) is not None:
                                has_number = True
                        if cell_i>=len(line):
                            continue
                        cell = line[cell_i]
                        cell_text = cell[0]
                        # Keep cells that are not themselves headers (or any
                        # cell once a purely numeric cell has been seen), and
                        # drop purely alphanumeric cells.
                        if re.search(_param_pattern,cell_text) is None or has_number:
                            if re.search("^[\da-zA-Z]+$",cell_text) is None:
                                table_products.append(cell_text)
                    if len(table_products)>0:
                        logger.debug("table products %s"%(str(table_products)))
                        if min([len(x) for x in table_products])>0 and max([len(x) for x in table_products])<=30:
                            if re.search("招标人|代理人|预算|数量|交货期|品牌|产地","".join(table_products)) is None:
                                list_table_products.append(table_products)
    # Prefer a column containing a name similar to the reference product.
    _find = False
    for table_products in list_table_products:
        for _p in table_products:
            if is_similar(_product,_p,90):
                _find = True
                logger.debug("similar table_products %s"%(str(table_products)))
                list_result = list(set([a for a in table_products if len(a)>1 and len(a)<20 and re.search("费用|预算|合计|金额|万元|运费|^其他$",a) is None]))
                break
    if not _find:
        for table_products in list_table_products:
            list_result.extend(table_products)
    list_result = list(set([a for a in list_result if len(a)>1 and len(a)<30 and re.search("费用|预算|合计|金额|万元|运费",a) is None]))
    return list_result
  800. def get_childs(childs, max_depth=None):
  801. list_data = []
  802. for _child in childs:
  803. list_data.append(_child)
  804. childs2 = _child.get("child_title",[])
  805. if len(childs2)>0 and (max_depth==None or max_depth>0):
  806. for _child2 in childs2:
  807. if max_depth != None:
  808. list_data.extend(get_childs([_child2], max_depth-1))
  809. else:
  810. list_data.extend(get_childs([_child2], None))
  811. return list_data
  812. class Html2KVTree():
  813. def __init__(self,_html,auto_merge_table=True,list_obj = []):
  814. if _html is None:
  815. _html = ""
  816. self.html = _html
  817. self.auto_merge_table = auto_merge_table
  818. if list_obj:
  819. self.list_obj = list_obj
  820. else:
  821. _tree = html_to_tree(html_content)
  822. self.list_obj = get_outobjs_from_tree(_tree)
  823. # for obj in self.list_obj:
  824. # print("obj",obj.get_text()[:20])
  825. self.tree = self.buildParsetree(self.list_obj,[],auto_merge_table)
  826. # #识别目录树
  827. # self.print_tree(self.tree,"-|")
  828. def get_soup_objs(self,soup,list_obj=None):
  829. if list_obj is None:
  830. list_obj = []
  831. childs = soup.find_all(recursive=False)
  832. for _obj in childs:
  833. childs1 = _obj.find_all(recursive=False)
  834. if len(childs1)==0 or len(_obj.get_text())<40 or _obj.name=="table":
  835. list_obj.append(_obj)
  836. elif _obj.name=="p":
  837. list_obj.append(_obj)
  838. else:
  839. self.get_soup_objs(_obj,list_obj)
  840. return list_obj
  841. def fix_tree(self,_product):
  842. products = extract_products(self.tree,_product)
  843. if len(products)>0:
  844. self.tree = self.buildParsetree(self.list_obj,products,self.auto_merge_table)
  845. def print_tree(self,tree,append="",set_tree_id=None):
  846. if set_tree_id is None:
  847. set_tree_id = set()
  848. if append=="":
  849. for t in tree:
  850. logger.debug("%s text:%s title:%s title_text:%s before:%s after%s product:%s"%("==>",t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"]))
  851. for t in tree:
  852. _id = id(t)
  853. if _id in set_tree_id:
  854. continue
  855. set_tree_id.add(_id)
  856. logger.info("%s text:%s title:%s title_text:%s before:%s after%s product:%s kv:%s"%(append,t["text"][:50],t["sentence_title"],t["sentence_title_text"],t["title_before"],t["title_after"],t["has_product"],str(t["kv"])))
  857. childs = t["child_title"]
  858. self.print_tree(childs,append=append+"-|",set_tree_id=set_tree_id)
  859. def is_title_first(self,title):
  860. if title in ("一","1","Ⅰ","a","A"):
  861. return True
  862. return False
  863. def find_title_by_pattern(self,_text,_pattern="(^|★|▲|:|:|\s+)(?P<title_1>(?P<title_1_index_0_0>第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章册包标部.::]+))|" \
  864. "([\s★▲\*]*)(?P<title_3>(?P<title_3_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?)(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_3_index_0_2>[、章册包标部.::]+))|" \
  865. "([\s★▲\*]*)(?P<title_4>(?P<title_4_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节章册部\.::、、]+))|" \
  866. "([\s★▲\*]*)(?P<title_5>(?P<title_5_index_0_0>^)(?P<title_5_index_1_1>[一二三四五六七八九十]+)(?P<title_5_index_2_0>)[^一二三四五六七八九十节章册部\.::、])|" \
  867. "([\s★▲\*]*)(?P<title_12>(?P<title_12_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_12_index_1_1>\d{1,2})(?P<title_12_index_2_0>[\..、\s\-]?))|"\
  868. "([\s★▲\*]*)(?P<title_11>(?P<title_11_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\..、\s\-]?))|" \
  869. "([\s★▲\*]*)(?P<title_10>(?P<title_10_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\..、\s\-]?))|" \
  870. "([\s★▲\*]*)(?P<title_7>(?P<title_7_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?\d{1,2}[\..\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\..包标::、\s\-]*))|" \
  871. "(^[\s★▲\*]*)(?P<title_6>(?P<title_6_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?包?)(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-包标]*))|" \
  872. "([\s★▲\*]*)(?P<title_15>(?P<title_15_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>[))包标\..::、]+))|" \
  873. "([\s★▲\*]+)(?P<title_17>(?P<title_17_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>[))包标\..::、]+))|" \
  874. "([\s★▲\*]*)(?P<title_19>(?P<title_19_index_0_0>[^一二三四五六七八九十\dⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]{,3}?[((]?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>[))]))"
  875. ):
  876. _se = re.search(_pattern,_text)
  877. groups = []
  878. if _se is not None:
  879. e = _se.end()
  880. if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量|评委)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天个分秒台条A-Za-z]|^(小时)', _text[e:])):
  881. return None
  882. elif re.match('[二三四五六七八九十]\w{1,2}[市区县]|五金|四川|八疆|九龙|[一二三四五六七八九十][层天标包]', _text) and re.match('[一二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
  883. return None
  884. elif re.search('^[\u4e00-\u9fa5]+[::]', _text[:e]):
  885. return None
  886. _gd = _se.groupdict()
  887. for k,v in _gd.items():
  888. if v is not None:
  889. groups.append((k,v))
  890. if len(groups):
  891. groups.sort(key=lambda x:x[0])
  892. return groups
  893. return None
  894. def make_increase(self,_sort,_title,_add=1):
  895. if len(_title)==0 and _add==0:
  896. return ""
  897. if len(_title)==0 and _add==1:
  898. return _sort[0]
  899. _index = _sort.index(_title[-1])
  900. next_index = (_index+_add)%len(_sort)
  901. next_chr = _sort[next_index]
  902. if _index==len(_sort)-1:
  903. _add = 1
  904. else:
  905. _add = 0
  906. return next_chr+self.make_increase(_sort,_title[:-1],_add)
  907. def get_next_title(self,_title):
  908. if re.search("^\d+$",_title) is not None:
  909. return str(int(_title)+1)
  910. if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
  911. if _title[-1]=="十":
  912. return _title+"一"
  913. if _title[-1]=="百":
  914. return _title+"零一"
  915. if _title[-1]=="九":
  916. if len(_title)==1:
  917. return "十"
  918. if len(_title)==2:
  919. if _title[0]=="十":
  920. return "二十"
  921. if len(_title)==3:
  922. if _title[0]=="九":
  923. return "一百"
  924. else:
  925. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title[0]))
  926. return _next_title+"十"
  927. _next_title = self.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
  928. _next_title = list(_next_title)
  929. _next_title.reverse()
  930. if _next_title[-1]!="十":
  931. if len(_next_title)>=2:
  932. _next_title.insert(-1,'十')
  933. if len(_next_title)>=4:
  934. _next_title.insert(-3,'百')
  935. if _title[0]=="十":
  936. if _next_title=="十":
  937. _next_title = ["二","十"]
  938. _next_title.insert(0,"十")
  939. _next_title = "".join(_next_title)
  940. return _next_title
  941. if re.search("^[a-z]+$",_title) is not None:
  942. _next_title = self.make_increase([chr(i+ord('a')) for i in range(26)],_title)
  943. _next_title = list(_next_title)
  944. _next_title.reverse()
  945. return "".join(_next_title)
  946. if re.search("^[A-Z]+$",_title) is not None:
  947. _next_title = self.make_increase([chr(i+ord('A')) for i in range(26)],_title)
  948. _next_title = list(_next_title)
  949. _next_title.reverse()
  950. return "".join(_next_title)
  951. if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
  952. _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
  953. _index = _sort.index(_title)
  954. if _index<len(_sort)-1:
  955. return _sort[_index+1]
  956. return None
  957. def count_title_before(self,list_obj):
  958. dict_before = {}
  959. dict_sentence_count = {}
  960. illegal_sentence = set()
  961. for obj_i in range(len(list_obj)):
  962. obj = list_obj[obj_i]
  963. _type = "sentence"
  964. _text = obj.text.strip()
  965. if obj.name=="table":
  966. _type = "table"
  967. _text = str(obj)
  968. _append = False
  969. if _type=="sentence":
  970. if len(_text)>10 and len(_text)<100:
  971. if _text not in dict_sentence_count:
  972. dict_sentence_count[_text] = 0
  973. dict_sentence_count[_text] += 1
  974. if re.search("\d+页",_text) is not None:
  975. illegal_sentence.add(_text)
  976. elif len(_text)<10:
  977. if re.search("第\d+页",_text) is not None:
  978. illegal_sentence.add(_text)
  979. sentence_groups = self.find_title_by_pattern(_text[:10])
  980. if sentence_groups:
  981. # c062f53cf83401e671822003d63c1828print("sentence_groups",sentence_groups)
  982. sentence_title = sentence_groups[0][0]
  983. sentence_title_text = sentence_groups[0][1]
  984. title_index = sentence_groups[-2][1]
  985. title_before = sentence_groups[1][1].replace("(","(").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
  986. title_after = sentence_groups[-1][1].replace(")",")").replace(":",":").replace(":",";").replace(",",".").replace(",",".").replace("、",".")
  987. next_index = self.get_next_title(title_index)
  988. if title_before not in dict_before:
  989. dict_before[title_before] = 0
  990. dict_before[title_before] += 1
  991. for k,v in dict_sentence_count.items():
  992. if v>10:
  993. illegal_sentence.add(k)
  994. return dict_before,illegal_sentence
  995. def is_page_no(self,sentence):
  996. if len(sentence)<10:
  997. if re.search("\d+页|^\-\d+\-$",sentence) is not None:
  998. return True
  999. def block_tree(self,childs):
  1000. for child in childs:
  1001. if not child["block"]:
  1002. child["block"] = True
  1003. childs2 = child["child_title"]
  1004. self.block_tree(childs2)
    def buildParsetree(self,list_obj,products=[],auto_merge_table=True,auto_append=False):
        """Turn the flat object list into outline nodes linked as a tree.

        Every kept object becomes a dict node (text, kv, position, title
        information, ...). Nodes recognised as titles are chained to the
        preceding title of the same series (``title_next``) and share its
        parent; other nodes, and titles that start a new series, are attached
        as children (``child_title``) of the nearest preceding title node.

        :param list_obj: objects exposing ``.name``, ``.text``,
            ``.sentence_index``, ``.wordOffset_begin``, ``.wordOffset_end``,
            ``.sentences`` and dict-style ``.get`` (for ``kv``, ``table_id``,
            ``tag``, ``position``).
        :param products: product names; a sentence whose text (minus its
            title marker) equals one of them is marked ``has_product`` and
            may be promoted to a synthetic ``title_0`` title.
        :param auto_merge_table: not read inside this method.
        :param auto_append: when True, a non-title sentence may be merged
            into the previous node instead of producing its own node.
        :return: list of all nodes in document order (the tree is reachable
            through each node's ``child_title`` / ``parent_title``).
        """
        self.parseTree = None
        trees = []
        list_length = []
        # Estimate the typical line width from the first 200 non-table
        # objects, capped at 40 characters.
        for obj in list_obj[:200]:
            if obj.name!="table":
                list_length.append(len(obj.text))
        if len(list_length)>0:
            max_length = max(list_length)
        else:
            max_length = 40
        max_length = min(max_length,40)
        logger.debug("%s:%d"%("max_length",max_length))
        list_data = []
        last_table_index = None
        last_table_columns = None
        last_table = None
        # dict_before: occurrences per title prefix; illegal_sentence:
        # sentences to drop (page markers / heavily repeated lines).
        dict_before,illegal_sentence = self.count_title_before(list_obj)
        for obj_i in range(len(list_obj)):
            obj = list_obj[obj_i]
            # logger.debug("==obj %s"%obj.text[:20])
            _type = "sentence"
            _text = standard_product(obj.text)
            if obj.name=="table":
                _type = "table"
                _text = standard_product(str(obj))
            _append = False
            sentence_title = None
            sentence_title_text = None
            sentence_groups = None
            title_index = None
            next_index = None
            parent_title = None
            title_before = None
            title_after = None
            title_next = None
            childs = []
            # new
            sentence_index = obj.sentence_index
            wordOffset_begin = obj.wordOffset_begin
            wordOffset_end = obj.wordOffset_end
            sentences = obj.sentences
            list_kv = obj.get("kv",[])
            table_id = obj.get("table_id")
            list_table = None
            block = False
            has_product = False
            position = obj.get("position",{})
            if _type=="sentence":
                if _text in illegal_sentence:
                    continue
                # Title markers are only looked for in the first 10 characters.
                sentence_groups = self.find_title_by_pattern(_text[:10])
                if sentence_groups:
                    title_before = standard_title_context(sentence_groups[1][1])
                    title_after = sentence_groups[-1][1]
                    sentence_title_text = sentence_groups[0][1]
                    other_text = _text.replace(sentence_title_text,"")
                    # Accept the marker as a real title when its prefix occurs
                    # more than once in the document or it has a suffix.
                    if (title_before in dict_before and dict_before[title_before]>1) or title_after!="":
                        sentence_title = sentence_groups[0][0]
                        title_index = sentence_groups[-2][1]
                        next_index = self.get_next_title(title_index)
                        other_text = _text.replace(sentence_title_text,"")
                        for p in products:
                            if other_text.strip()==p.strip():
                                has_product = True
                    else:
                        # Marker rejected: still promote the sentence to a
                        # synthetic product title if it names a known product.
                        _fix = False
                        for p in products:
                            if other_text.strip()==p.strip():
                                title_before = "=产品"
                                sentence_title = "title_0"
                                sentence_title_text = p
                                title_index = "0"
                                title_after = "产品="
                                next_index = "0"
                                _fix = True
                                has_product = True
                                break
                        if not _fix:
                            title_before = None
                            title_after = None
                            sentence_title_text = None
                else:
                    # No marker at all: a short sentence matching
                    # _param_pattern that contains a product name becomes a
                    # synthetic product title.
                    if len(_text)<40 and re.search(_param_pattern,_text) is not None:
                        for p in products:
                            if _text.find(p)>=0:
                                title_before = "=产品"
                                sentence_title = "title_0"
                                sentence_title_text = p
                                title_index = "0"
                                title_after = "产品="
                                next_index = "0"
                                _fix = True
                                has_product = True
                                break
            # Merge two consecutive non-title sentences. Disabled by default
            # since 2024-11-06 because the end position of the tender content
            # came out wrong (case 485441521).
            if auto_append:
                if _type=="sentence":
                    if sentence_title is None and len(list_data)>0 and list_data[-1]["sentence_title"] is not None and list_data[-1]["line_width"]>=max_length*0.6:
                        list_data[-1]["text"] += _text
                        list_data[-1]["line_width"] = len(_text)
                        update_kv_span(list_kv,len(_text))
                        list_data[-1]["kv"].extend(list_kv)
                        list_data[-1]["sentences"].extend(sentences)
                        _append = True
                    elif sentence_title is None and len(list_data)>0 and _type==list_data[-1]["type"]:
                        if list_data[-1]["line_width"]>=max_length*0.7:
                            list_data[-1]["text"] += _text
                            list_data[-1]["line_width"] = len(_text)
                            update_kv_span(list_kv,len(_text))
                            list_data[-1]["kv"].extend(list_kv)
                            list_data[-1]["sentences"].extend(sentences)
                            _append = True
            if not _append:
                _data = {"type":_type,"tag":obj.get("tag"),"table_id":table_id, "text":_text,"sentences":sentences,"list_table":list_table,
                         "line_width":len(_text),"sentence_title":sentence_title,"title_index":title_index,
                         "sentence_title_text":sentence_title_text,"sentence_groups":sentence_groups,"parent_title":parent_title,
                         "child_title":childs,"title_before":title_before,"title_after":title_after,"title_next":title_next,"next_index":next_index,
                         "block":block,"has_product":has_product,
                         "sentence_index":sentence_index,"wordOffset_begin":wordOffset_begin,"wordOffset_end":wordOffset_end,
                         "kv":list_kv,"position":position
                         }
                if sentence_title is not None:
                    if len(list_data)>0:
                        if self.is_title_first(title_index):
                            # First index of a series: becomes a child of the
                            # nearest preceding title node.
                            for i in range(1,len(list_data)+1):
                                _d = list_data[-i]
                                if _d["sentence_title"] is not None:
                                    _data["parent_title"] = _d
                                    _d["child_title"].append(_data)
                                    break
                        else:
                            # Look backwards for the previous sibling of this
                            # title, relaxing the criteria pass after pass.
                            _find = False
                            # Pass 1: same title kind/prefix/suffix and the
                            # candidate expects exactly this index next.
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
                                    if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
                                        _data["parent_title"] = _d["parent_title"]
                                        _d["title_next"] = _data
                                        if len(_d["child_title"])>0:
                                            _d["child_title"][-1]["title_next"] = ""
                                            self.block_tree(_d["child_title"])
                                        if _d["parent_title"] is not None:
                                            _d["parent_title"]["child_title"].append(_data)
                                        _find = True
                                        break
                            # Pass 2: the immediately preceding node has the
                            # same title kind/prefix/suffix (index not checked).
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if i==1 and not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==_d["title_before"] and title_after==_d["title_after"]:
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # From here on prefix/suffix are compared after
                            # standard_title_context normalisation.
                            title_before = standard_title_context(title_before)
                            title_after = standard_title_context(title_after)
                            # Pass 3: like pass 1, on normalised prefix/suffix.
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
                                    if _d["next_index"]==title_index and _d["title_next"] is None and not _d["block"]:
                                        _data["parent_title"] = _d["parent_title"]
                                        _d["title_next"] = _data
                                        if len(_d["child_title"])>0:
                                            _d["child_title"][-1]["title_next"] = ""
                                            self.block_tree(_d["child_title"])
                                        if _d["parent_title"] is not None:
                                            _d["parent_title"]["child_title"].append(_data)
                                        _find = True
                                        break
                            # Pass 4: any unblocked node with the same kind and
                            # normalised prefix/suffix (index not checked).
                            for i in range(1,len(list_data)+1):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]) and title_after==standard_title_context(_d["title_after"]):
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        # self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # Pass 5: among the last 19 nodes, same kind and
                            # normalised prefix only (suffix ignored).
                            for i in range(1,min(len(list_data)+1,20)):
                                if _find:
                                    break
                                _d = list_data[-i]
                                if not _d["block"] and _d.get("sentence_title")==sentence_title and title_before==standard_title_context(_d["title_before"]):
                                    _data["parent_title"] = _d["parent_title"]
                                    _d["title_next"] = _data
                                    if len(_d["child_title"])>0:
                                        _d["child_title"][-1]["title_next"] = ""
                                        # self.block_tree(_d["child_title"])
                                    if _d["parent_title"] is not None:
                                        _d["parent_title"]["child_title"].append(_data)
                                    _find = True
                                    break
                            # No sibling found: attach under the nearest
                            # preceding title node.
                            if not _find:
                                if len(list_data)>0:
                                    for i in range(1,len(list_data)+1):
                                        _d = list_data[-i]
                                        if _d.get("sentence_title") is not None:
                                            _data["parent_title"] = _d
                                            _d["child_title"].append(_data)
                                            break
                else:
                    # Plain content: child of the nearest preceding title node.
                    if len(list_data)>0:
                        for i in range(1,len(list_data)+1):
                            _d = list_data[-i]
                            if _d.get("sentence_title") is not None:
                                _data["parent_title"] = _d
                                _d["child_title"].append(_data)
                                break
                list_data.append(_data)
        # Spread has_product between adjacent sibling titles that share the
        # same prefix and suffix: one forward sweep, then one backward sweep.
        for _data in list_data:
            childs = _data["child_title"]
            for c_i in range(len(childs)):
                cdata = childs[c_i]
                if cdata["has_product"]:
                    continue
                else:
                    if c_i>0:
                        last_cdata = childs[c_i-1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
                    if c_i<len(childs)-1:
                        last_cdata = childs[c_i+1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
            for c_i in range(len(childs)):
                # NOTE(review): cdata is taken from the end of the list, but
                # the neighbours below are still indexed with c_i counted from
                # the start, so they are not cdata's actual neighbours --
                # confirm whether this is intended.
                cdata = childs[len(childs)-1-c_i]
                if cdata["has_product"]:
                    continue
                else:
                    if c_i>0:
                        last_cdata = childs[c_i-1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
                    if c_i<len(childs)-1:
                        last_cdata = childs[c_i+1]
                        if cdata["sentence_title"] is not None and last_cdata["sentence_title"] is not None and last_cdata["title_before"]==cdata["title_before"] and last_cdata["title_after"]==cdata["title_after"] and last_cdata["has_product"]:
                            cdata["has_product"] = True
        return list_data
  1258. def get_tree_sentence(self):
  1259. list_sentence = []
  1260. for obj in self.tree:
  1261. list_sentence.extend(obj.get("sentences",[]))
  1262. return list_sentence
  1263. def extract_kvs_from_table(self,list_pattern,tree=None,result_kv=None):
  1264. if result_kv is None:
  1265. result_kv = [[] for i in list_pattern]
  1266. try:
  1267. for pattern in list_pattern:
  1268. re.compile(pattern)
  1269. except Exception as e:
  1270. log("list_pattern error: "+str(e))
  1271. return result_kv
  1272. if tree is None:
  1273. tree = self.tree
  1274. for obj in tree:
  1275. is_table = True if obj.get("tag","")=="table" else False
  1276. if is_table:
  1277. table_id = obj.get("table_id")
  1278. list_kv = obj.get("kv")
  1279. for _pi in range(len(list_pattern)):
  1280. table_kvs = []
  1281. for _d0 in list_kv:
  1282. _k = _d0.get("key","")
  1283. _v = _d0.get("value","")
  1284. _d = {"key":_k,"value":_v,"position":_d0.get("position",{})}
  1285. if re.search(list_pattern[_pi],_k) is not None:
  1286. table_kvs.append(_d)
  1287. if table_kvs:
  1288. result_kv[_pi].append({"table_id":table_id,"kv":table_kvs})
  1289. childs = obj.get("children",[])
  1290. for child in childs:
  1291. self.extract_kvs_from_table(list_pattern,child,result_kv)
  1292. return result_kv
  1293. def extract_kvs_from_sentence(self,list_pattern,tree=None,result_kv=None):
  1294. if result_kv is None:
  1295. result_kv = [[] for i in list_pattern]
  1296. try:
  1297. for pattern in list_pattern:
  1298. re.compile(pattern)
  1299. except Exception as e:
  1300. log("list_pattern error: "+str(e))
  1301. return result_kv
  1302. if tree is None:
  1303. tree = self.tree
  1304. for obj in tree:
  1305. is_table = True if obj.get("tag","")=="table" else False
  1306. if not is_table:
  1307. list_kv = obj.get("kv",[])
  1308. for _pi in range(len(list_pattern)):
  1309. for _d in list_kv:
  1310. _k = _d.get("key","")
  1311. _v = _d.get("value","")
  1312. if re.search(list_pattern[_pi],_k) is not None:
  1313. result_kv[_pi].append(_d)
  1314. return result_kv
  1315. def extract_kvs_from_outline(self,list_pattern,tree=None,result_kv=None):
  1316. if result_kv is None:
  1317. result_kv = [[] for i in list_pattern]
  1318. try:
  1319. for pattern in list_pattern:
  1320. re.compile(pattern)
  1321. except Exception as e:
  1322. log("list_pattern error: "+str(e))
  1323. return result_kv
  1324. if tree is None:
  1325. tree = self.tree
  1326. for obj in tree:
  1327. is_table = True if obj.get("tag","")=="table" else False
  1328. if not is_table:
  1329. _text = obj["text"]
  1330. for _pi in range(len(list_pattern)):
  1331. sentence_index_from = obj["sentence_index"]
  1332. sentence_index_to = sentence_index_from
  1333. if re.search(list_pattern[_pi],_text) is not None and obj.get("sentence_title") is not None:
  1334. childs = get_childs([obj])
  1335. _child_text = ""
  1336. for _child in childs:
  1337. sentence_index_to = _child["sentence_index"]
  1338. _child_text+=_child["text"]+"\n"
  1339. result_kv[_pi].append({"key":_text,"value":_child_text,"from_outline":True,"key_sentence_index_from":sentence_index_from,
  1340. "key_sentence_index_to":sentence_index_from,"value_sentence_index_from":sentence_index_from,
  1341. "value_sentence_index_to":sentence_index_to,})
  1342. return result_kv
  1343. def extract_kv(self,k_pattern,from_sentence=True,from_outline=True,from_table=True):
  1344. result_kv = []
  1345. try:
  1346. re.compile(k_pattern)
  1347. except Exception as e:
  1348. log("k_pattern error: "+str(e))
  1349. traceback.print_exc()
  1350. return result_kv
  1351. result_kv = []
  1352. if from_table:
  1353. result_kv_table = self.extract_kvs_from_table([k_pattern])
  1354. for table_d in result_kv_table[0]:
  1355. table_id = table_d.get("table_id")
  1356. table_kvs = table_d.get("kv",[])
  1357. for _d in table_kvs:
  1358. _d["from_table"] = True
  1359. result_kv.extend(table_kvs)
  1360. if from_sentence:
  1361. result_kv_sentence = self.extract_kvs_from_sentence([k_pattern])
  1362. for _d in result_kv_sentence[0]:
  1363. _d["from_sentence"] = True
  1364. result_kv.extend(result_kv_sentence[0])
  1365. if from_outline:
  1366. result_kv_outline = self.extract_kvs_from_outline([k_pattern])
  1367. for _d in result_kv_outline[0]:
  1368. _d["from_outline"] = True
  1369. result_kv.extend(result_kv_outline[0])
  1370. return result_kv
  1371. # def extract_kvs_from_table(self,list_pattern):
if __name__ == '__main__':
    # Manual demo: build the tree for a sample announcement page, log it,
    # and print the kv pairs whose key matches "资格条件".
    # HTML text
    html_content = """
<div>
<div>
<div>
<div>
<span>项目名称:</span>
<span><mark data-markjs="true">广东公司肇庆热力供热管网设计服务项目询价采购</mark></span>
</div>
<div>
<span>采购机构:</span>
<span><a target="_blank" class="markBlue" href="/bdqyhx/340219751287832576.html" style="color: #3083EB !important;text-decoration: underline;">国能物资南方有限公司</a></span>
</div>
<div>
<span>采购编号:</span>
<span>NFSB-FWXJ-2024110471</span>
</div>
<div>
<span>采购人:</span>
<span><a target="_blank" class="markBlue" href="/bdqyhx/213048615245266944.html" style="color: #3083EB !important;text-decoration: underline;">肇庆大旺电力热力有限公司</a></span>
</div>
<div>
<span>报价人资格条件:</span>
<span>报价人资质要求:报价人须同时满足以下资质证书 1. 工程设计资质证书-市政-市政行业资质乙级 或 工程设计资质证书-电力-电力行业资质乙级 2. 特种设备生产许可证-压力管道-公用管道GB2。报价人业绩要求:报价人须满足以下业绩 1. 报价人须提供近五年内(2019年1月1日至报价截止日期) 管道设计(长度大于3公里) 合同 至少 2 个, 报价人须提供符合本采购要求的业绩合同扫描件,必须包含采购范围、合同签订时间、甲乙方盖章页, 报价人须同时提供业绩合同对应的其他证明文件: 结算发票(开票时间2019-01-01至2024-11-30) ,未按上述要求提供的业绩证明文件为无效证明文件。</span>
</div>
<div>
<span>采购方式:</span>
<span> 询价采购</span>
</div>
<div>
<span>询价方式:</span>
<span> 公开询价</span>
</div>
<div>
<span>物资分类:</span>
<span> 火电设备-&gt;热控系统设备及配件;服务-&gt;其它;燃机设备-&gt;锅炉设备;火电设备-&gt;除灰设备及配件;服务-&gt;综合服务;火电设备-&gt;汽机辅机设备及配件;服务-&gt;勘察设计;火电设备-&gt;脱硫设备及配件;火电设备-&gt;锅炉主机设备及配件;燃机设备-&gt;燃气轮机设备及配件;</span>
</div>
<div>
<span>主要技术要求:</span>
<span> 本设计服务采购范围包括但不限于: 1.新增或改造供热管网开口项目编制建设项目可行性研究报告、设计方案、勘察设计、初步设计、施工图设计文件、非标准设备设计文件、施工图预算文件等服务。 2.报价人自行组织现场踏勘,采购人可提供供热管网母管参数、用汽用户需求参数等资料,报价人根据相关资料提供项目设计方案,内容应包含项目概况、热负荷计算、管网敷设方案、工程投资估算等。</span>
</div>
</div>
<div>
<div>
<div>
<span>发布人:</span>
<span> 任灏洋</span>
</div>
<div>
<span>报价方式:</span>
<span>整单</span>
</div>
</div>
<div>
<div>
<span>联系电话:</span>
</div>
<div>
<span>发布时间:</span>
<span>2024-11-29 16:43:56</span>
</div>
</div>
<div>
<div>
<span>服务时间:</span>
<span>合同签订后730天内 </span>
</div>
<div>
<span>报价截止时间:</span>
<span> 2024-12-04 09:00:00</span>
</div>
</div>
<div>
<div>
<span>支付方式:</span>
<span> 电汇</span>
</div>
<div>
<span>运费承接:</span>
<span> 供应方承担</span>
</div>
</div>
<div>
<div>
<span>服务地点:</span>
<span><a target="_blank" class="markBlue" href="/bdqyhx/213048615245266944.html" style="color: #3083EB !important;text-decoration: underline;">肇庆大旺电力热力有限公司</a>物资工厂</span>
</div>
<div>
<span>异议联系人:</span>
<span> 杨帆</span>
</div>
</div>
</div>
<div>
<div>
<div>
<div>
附件:
</div>
<div>
<p> 有 </p>
</div>
</div>
<div>
<span>异议接收单位:</span>
<span> <a target="_blank" class="markBlue" href="/bdqyhx/340219751287832576.html" style="color: #3083EB !important;text-decoration: underline;">国能物资南方有限公司</a></span>
</div>
</div>
</div>
<div>
<div>
<div>
<span>备注:</span>
</div>
</div>
</div>
<div>
<div>
<span>发布平台:</span>
<span>国家能源e购(网址:www.neep.shop),报价人须在发布平台注册、经审核通过并缴纳供应商分类年费后才能参与具体项目报价。</span>
</div>
</div>
</div>
</div>
"""
    # NOTE(review): _tree is not used below -- confirm whether this call is
    # still needed.
    _tree = html_to_tree(html_content)
    _pd = Html2KVTree(html_content)
    _pd.print_tree(_pd.tree,"-|")
    list_kv = _pd.extract_kv("资格条件")
    print(list_kv)
    # Get all sentences after preprocessing; these sentences correspond to
    # the kv values.
    print(_pd.get_tree_sentence())
    # soup = BeautifulSoup(html_content,"lxml")
    # table_tree = table_to_tree(soup)
    # print(json.dumps(table_tree,ensure_ascii=False))