123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506 |
- #!/usr/bin/env python
- #encoding:utf-8
- from deepdive import *
- from commonutil import *
- from bs4 import BeautifulSoup, Comment
- import copy
- import re
- import os
- os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
def table2text(soup):
    """Convert the tables of an announcement into plain text, in place.

    Every ``tbody`` (or every ``table`` when the document contains no
    ``tbody``) is handled in two steps:

    1. Cells carrying ``rowspan``/``colspan`` are duplicated so that the
       following rows / the same row get one cell per spanned position.
    2. The element's content is replaced by generated text and the element
       is renamed:
       * ``th`` cells in exactly one row  -> "header:value;" per data row,
         element renamed to ``div``;
       * ``th`` cells in several rows     -> "header" + "value;" per column,
         element renamed to ``p``;
       * no usable ``th`` cells, but the first row has more than two ``td``
         cells and more than two header keywords -> first row is promoted to
         ``th`` and treated as the header, element renamed to ``p``.
       Tables matching none of these cases keep their content.

    Args:
        soup: BeautifulSoup instance of the announcement.

    Returns:
        The same ``soup`` object after the tables have been rewritten.
    """

    def _take_span(cell, attr):
        """Return the span count stored in ``attr`` and reset it to 1.

        Returns 1 (nothing to duplicate) when the attribute is missing, empty
        or contains no digit at all, instead of failing on ``int("")``.
        """
        if attr not in cell.attrs or str(cell[attr]) == "":
            return 1
        digits = re.sub("[^0-9]", "", str(cell[attr]))
        cell[attr] = 1
        return int(digits) if digits else 1

    # Keywords that identify a header row written with plain <td> cells.
    keyword_pat = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')

    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')

    for tbody in tbodies:
        rows = tbody.findChildren('tr', recursive=False)
        header_cells = []
        # (row, first th of that row), kept in document order so that a row
        # is always paired with its own label in the vertical layout.
        header_rows = []

        # Step 1: complete the cells hidden by rowspan / colspan.
        for row_idx, tr in enumerate(rows):
            row_ths = tr.findChildren('th', recursive=False)
            if len(row_ths) > 0:
                header_cells.extend(row_ths)
                header_rows.append((tr, row_ths[0]))

            cells = tr.findChildren(recursive=False)
            if len(cells) <= 1:
                continue
            for cell_idx, cell in enumerate(cells):
                # rowspan: copy the cell to the same position of the rows below
                for offset in range(1, _take_span(cell, 'rowspan')):
                    if row_idx + offset >= len(rows):
                        continue
                    below = rows[row_idx + offset].findChildren('td', recursive=False)
                    if len(below) == 0:
                        below = rows[row_idx + offset].findChildren('th', recursive=False)
                    if len(below) >= cell_idx and len(below) > 0:
                        if cell_idx > 0:
                            below[cell_idx - 1].insert_after(copy.copy(cell))
                        else:
                            below[0].insert_before(copy.copy(cell))
                # colspan: repeat the cell right after itself in the same row
                for _ in range(1, _take_span(cell, 'colspan')):
                    cell.insert_after(copy.copy(cell))

        # Step 2: turn the table into text.
        if len(header_cells) > 1:  # table with <th> headers
            if len(header_rows) == 1:  # horizontal table: one header row
                pieces = []
                # the first row is skipped: it is taken to be the header row
                for tr in tbody.findChildren('tr', recursive=False)[1:]:
                    tds = tr.findChildren('td', recursive=False)
                    line = "".join(
                        th.get_text() + ":" + td.get_text() + ";"
                        for th, td in zip(header_cells, tds)
                    )
                    pieces.append(line + ";")
                tbody.string = "".join(pieces)
                tbody.name = 'div'
            else:  # vertical table: every header row is "label + values"
                labelled = [
                    (label, tr.findChildren('td', recursive=False))
                    for tr, label in header_rows
                ]
                pieces = []
                # one chunk per value column of the first header row
                for col in range(len(labelled[0][1])):
                    for label, tds in labelled:
                        if col < len(tds):
                            pieces.append(label.get_text() + tds[col].get_text() + ";")
                tbody.string = "".join(pieces)
                tbody.name = 'p'
        else:  # horizontal table whose header row uses <td> instead of <th>
            rows = tbody.findChildren('tr', recursive=False)
            if len(rows) == 0:
                continue
            first_cells = rows[0].findChildren('td', recursive=False)
            if len(first_cells) <= 2:
                continue
            joined = ";".join(td.get_text() for td in first_cells)
            if len(keyword_pat.findall(joined)) <= 2:
                continue
            for td in first_cells:
                td.name = 'th'
            pieces = []
            for tr in rows[1:]:
                tds = tr.findChildren('td', recursive=False)
                line = "".join(
                    th.get_text() + ":" + td.get_text() + ";"
                    for th, td in zip(first_cells, tds)
                )
                pieces.append(line + ";")
            tbody.string = "".join(pieces)
            tbody.name = 'p'

    return soup
def tableToText(soup):
    """Rewrite every table of an announcement as header-qualified text.

    Every ``tbody`` (or every ``table`` when the document contains no
    ``tbody``) is processed from the last one to the first one, so that a
    nested table is converted before the table that contains it:

    1. ``fixSpan``  duplicates cells carrying ``colspan``/``rowspan``.
    2. ``getTable`` / ``fixTable`` build a rectangular grid of
       ``[text, flag]`` cells (flag 0 = value, 1 = row header,
       2 = column header).
    3. ``setHead`` marks header rows and header columns with keyword rules.
    4. ``getTableText`` emits "head:...:value," items, one sentence per row
       (or per column), and the result replaces the element's content; the
       element is renamed to ``table``.

    Args:
        soup: BeautifulSoup instance of the announcement.

    Returns:
        The same ``soup`` object after the tables have been rewritten.
    """

    def takeSpan(cell, attr):
        """Return the span count stored in ``attr`` and reset it to 1.

        Returns 1 (nothing to duplicate) when the attribute is missing, empty
        or contains no digit at all, instead of failing on ``int("")``.
        """
        if attr not in cell.attrs or str(cell[attr]) == "":
            return 1
        digits = re.sub("[^0-9]", "", str(cell[attr]))
        cell[attr] = 1
        return int(digits) if digits else 1

    def fixSpan(tbody):
        """Duplicate spanned cells so every row has one cell per position."""
        trs = tbody.findChildren('tr', recursive=False)

        # Columns are completed before rows; doing both in one pass can
        # scramble the layout of the table.
        for tr in trs:
            # rows that contain a nested table are left untouched
            if len(tr.findChildren('table')) > 0:
                continue
            for td in tr.findChildren(recursive=False):
                # colspan: repeat the cell right after itself in the same row
                for _ in range(1, takeSpan(td, 'colspan')):
                    td.insert_after(copy.copy(td))

        for indtr, tr in enumerate(trs):
            if len(tr.findChildren('table')) > 0:
                continue
            for indtd, td in enumerate(tr.findChildren(recursive=False)):
                # rowspan: copy the cell to the same position of the rows below
                for offset in range(1, takeSpan(td, 'rowspan')):
                    if indtr + offset >= len(trs):
                        continue
                    below = trs[indtr + offset].findChildren(['td', 'th'], recursive=False)
                    if len(below) >= indtd and len(below) > 0:
                        if indtd > 0:
                            below[indtd - 1].insert_after(copy.copy(td))
                        else:
                            below[0].insert_before(copy.copy(td))

    def getTable(tbody):
        """Return the table as rows of ``[text_without_whitespace, 0]``."""
        return [
            [[re.sub(r'\s+', '', td.get_text()), 0]
             for td in tr.findChildren(['td', 'th'], recursive=False)]
            for tr in tbody.findChildren('tr', recursive=False)
        ]

    def fixTable(inner_table):
        """Pad short rows with empty value cells so the grid is rectangular."""
        maxWidth = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            # a fresh list per padding cell: flags are mutated per cell later
            row.extend(["", 0] for _ in range(maxWidth - len(row)))
        return inner_table

    def setHead(inner_table, pattern, pat_value, count):
        """Flag header rows (1) and header columns (2) in ``inner_table``.

        A line of cells is a header when none of its cells matches
        ``pat_value`` and at least ``count`` distinct cell texts match
        ``pattern``.

        Returns:
            ``(inner_table, head_list)`` where ``head_list`` is
            ``[0, <index of every header row>..., len(inner_table)]``;
            consecutive entries delimit the sections used by the caller.
        """
        height = len(inner_table)
        width = len(inner_table[0])

        def isHeadLine(cells):
            matched = set()
            for text, _flag in cells:
                # a value-like cell anywhere in the line vetoes the header
                if re.search(pat_value, text) is not None:
                    return False
                if len(re.findall(pattern, text)) > 0:
                    matched.add(text)
            return len(matched) >= count

        head_list = [0]
        # row headers
        for i in range(height):
            line = inner_table[i][:width]
            if isHeadLine(line):
                head_list.append(i)
                for cell in line:
                    cell[1] = 1
        head_list.append(height)

        # column headers, looked up independently inside every section
        for section in range(len(head_list) - 1):
            head_begin = head_list[section]
            head_end = head_list[section + 1]
            # the last column is never turned into a column header
            for col in range(width - 1):
                column = [inner_table[row][col] for row in range(head_begin, head_end)]
                if isHeadLine(column):
                    for cell in column:
                        cell[1] = 2
        return inner_table, head_list

    def getDirect(inner_table, begin, end):
        """Return "row" or "column": how rows ``begin``..``end-1`` are read."""
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for row in range(begin, end):
            for col in range(widths):
                flag = inner_table[row][col][1]
                if flag == 1:
                    row_head.add(row)
                if flag == 2:
                    column_head.add(col)

        company_pattern = re.compile("公司")
        if 0 in column_head and begin not in row_head:
            return "column"
        if 0 in column_head and begin in row_head:
            # Both a header row and a header column: read by column when some
            # row has only company names (at least two) as values.
            for row in range(begin, end):
                count = 0
                count_flag = True
                # every column is inspected for every row
                for col in range(widths):
                    text, flag = inner_table[row][col]
                    if flag == 0:
                        if company_pattern.search(text) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count >= 2:
                    return "column"
        return "row"

    def getTableText(inner_table, head_list):
        """Serialise the flagged grid into text.

        Each value cell becomes "<heads><value>," where <heads> is made of
        the row headers found above it followed by the column headers found
        to its left, each followed by ":". Inside one line (a row or a
        column, depending on ``getDirect``) ranking items come first, then
        entity items, then the rest; the line ends with "。". An item already
        emitted in the current section is skipped.
        """
        rankPattern = "(排名|排序|名次|评标结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        width = len(inner_table[0])

        def prependHeads(cells, flag, head):
            """Prepend to ``head`` the run of ``flag`` cells met in ``cells``.

            Cells are scanned moving away from the value cell; scanning stops
            at the first other cell met after the run started, and a header
            equal to the previous one of the run is not repeated.
            """
            found = False
            last = ""
            for text, cell_flag in cells:
                if cell_flag == flag:
                    if not found or text != last:
                        head = text + ":" + head
                    found = True
                    last = text
                elif found:
                    break
            return head

        def headOf(i, j):
            # column headers to the left first, then row headers above
            head = prependHeads(reversed(inner_table[i][:j]), 2, "")
            above = (inner_table[row][j] for row in range(i - 1, -1, -1))
            return prependHeads(above, 1, head)

        text = ""
        for head_i in range(len(head_list) - 1):
            text_set = set()
            head_begin = head_list[head_i]
            head_end = head_list[head_i + 1]

            if getDirect(inner_table, head_begin, head_end) == "row":
                lines = [[(i, j) for j in range(width)]
                         for i in range(head_begin, head_end)]
            else:
                lines = [[(i, j) for i in range(head_begin, head_end)]
                         for j in range(width)]

            for line in lines:
                rank_text = ""
                entity_text = ""
                text_line = ""
                for i, j in line:
                    value, flag = inner_table[i][j]
                    # only value cells produce text
                    if flag != 0:
                        continue
                    head = headOf(i, j)
                    item = str(head + value)
                    if item in text_set:
                        continue
                    if re.search(rankPattern, head) is not None:
                        rank_text += item + ","
                    elif re.search(entityPattern, head) is not None:
                        entity_text += item + ","
                    else:
                        text_line += item + ","
                    text_set.add(item)
                text += rank_text + entity_text + text_line
                # replace the last character (trailing separator) by a full stop
                text = text[:-1] + "。"
        return text

    pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile(r"(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")

    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')

    # reverse order so that nested tables are converted before their parents
    for tbody in reversed(tbodies):
        fixSpan(tbody)
        inner_table = fixTable(getTable(tbody))
        if len(inner_table) > 0:
            inner_table, head_list = setHead(inner_table, pat_head, pat_value, 3)
            tbody.string = getTableText(inner_table, head_list)
            tbody.name = "table"
    return soup
def segment(soup):
    """Flatten an announcement into one line of text with clean punctuation.

    Steps:
      1. Walk the document and insert "。" after every ``tr``, wrap every
         ``td``/``a``/``span`` in "#subs<tag>#" / "#sube<tag>#" markers and
         insert a space after every ``span`` (this mutates ``soup``).
      2. Take the document text, replace '"' by "“" and drop line breaks.
      3. Remove the markers together with all whitespace they enclose.
      4. Merge adjacent punctuation marks, drop whitespace following a
         punctuation mark, collapse repeated "。" (leading and trailing "。"
         are dropped as well) and finally remove all remaining whitespace.

    Args:
        soup: BeautifulSoup instance of the announcement.

    Returns:
        The cleaned text as a ``str``.
    """
    sentence_tags = ("tr",)
    comma_tags = ()
    space_tags = ("span",)
    marker_tags = ("td", "a", "span")

    # A document without <body> (e.g. empty content) is walked from its root
    # instead of failing on ``None.descendants``.
    root = soup.body if soup.body is not None else soup
    for child in root.descendants:
        if child.name in sentence_tags:
            child.insert_after("。")
        if child.name in comma_tags:
            child.insert_after(",")
        if child.name in marker_tags:
            child.insert_before("#subs" + str(child.name) + "#")
            child.insert_after("#sube" + str(child.name) + "#")
        if child.name in space_tags:
            child.insert_after(" ")
    text = str(soup.get_text())

    # '"' breaks the import into deepdive, so it is replaced by "“"
    text = text.replace('"', "“").replace("\r", "").replace("\n", "")

    # NOTE(review): the original comments say these three calls convert
    # English colon / comma / semicolon next to a Chinese character into the
    # Chinese punctuation mark, but as written each replacement is the same
    # character as the one matched, so the text is left unchanged. Confirm
    # the intended replacement characters.
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])", ",", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])", ";", text)

    # remove the markers and every whitespace character found between them
    for tag in marker_tags:
        marker_pat = re.compile("#subs" + str(tag) + "#(.*?)#sube" + str(tag) + "#")
        while True:
            found = marker_pat.search(text)
            if found is None:
                break
            text = text.replace(found.group(0), re.sub(r"\s", "", found.group(1)))

    # normalise punctuation until nothing changes any more
    while True:
        # "," directly followed by another mark: keep the other mark
        punc = re.search(r",(?P<punc>:|。|,|;)\s*", text)
        if punc is not None:
            text = re.sub("," + punc.group("punc") + r"\s*", punc.group("punc"), text)

        # a mark followed by ",": keep the mark
        punc = re.search(r"(?P<punc>:|。|,|;)\s*,", text)
        if punc is not None:
            text = re.sub(punc.group("punc") + r"\s*,", punc.group("punc"), text)
        else:
            # whitespace right after a mark
            punc = re.search(r"(?P<punc>:|。|,|;)\s+", text)
            if punc is not None:
                text = re.sub(punc.group("punc") + r"\s+", punc.group("punc"), text)
            else:
                break

    # collapse consecutive "。" into one
    text = "。".join(part for part in text.split("。") if len(part) > 0)
    # remove all remaining whitespace
    text = re.sub(r"\s+", "", text)
    return text
@tsv_extractor
@returns(lambda
        doc_id = "text",
        content ="text",
    :[])
def extract(
        doc_id = "text",
        content ="text",
    ):
    """Yield ``[doc_id, cleaned_text]`` for one announcement row.

    The HTML in ``content`` is parsed with the "lxml" parser, its tables are
    rewritten by ``tableToText`` and the result is flattened by ``segment``.
    Nothing is yielded when the cleaned text is longer than 20000 characters
    or when ``content`` is ``None``.

    Args:
        doc_id: identifier of the document (column type "text").
        content: HTML source of the document (column type "text").
    """
    log("doc_id="+str(doc_id))
    # NOTE(review): presumably a NULL column value arrives here as None;
    # BeautifulSoup cannot parse None, so such rows are skipped - confirm.
    if content is None:
        return
    content_new = segment(tableToText(BeautifulSoup(content,"lxml")))
    if len(content_new)<=20000:
        yield [
            doc_id,
            content_new,
        ]
|