#!/usr/bin/env python
#encoding:utf-8
"""DeepDive extractor that flattens HTML announcements into plain text.

Tables are rewritten into "header:value" sentences, then the whole document
is reduced to a single whitespace-free string.

NOTE(review): the copy of this file that was reviewed had lost its line
breaks and every span of text between a "<" and the following ">".  The
places where code had to be reconstructed are marked with NOTE(review)
comments below and must be checked against the authoritative source.
"""

from deepdive import *
from commonutil import *
from bs4 import BeautifulSoup, Comment
import copy
import re
import os
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'


def _take_span(cell, attr):
    """Return the numeric value of a rowspan/colspan attribute and reset it to 1.

    Args:
        cell: a BeautifulSoup tag (table cell).
        attr: 'rowspan' or 'colspan'.
    Returns:
        The span as an int.  1 is returned when the attribute is missing,
        empty, or contains no digit (the previous code raised ValueError on
        values such as "auto" because it called int("")).
    """
    if attr not in cell.attrs:
        return 1
    raw = str(cell[attr])
    if raw == "":
        return 1
    digits = re.sub("[^0-9]", "", raw)
    # Reset first so that copies of this cell do not get expanded again.
    cell[attr] = 1
    return int(digits) if digits else 1


def _fill_colspan(cell):
    """Expand a colspan by inserting copies of the cell right after it."""
    for _ in range(1, _take_span(cell, 'colspan')):
        cell.insert_after(copy.copy(cell))


def _fill_rowspan(trs, row_index, col_index, cell):
    """Expand a rowspan by copying the cell into the same column of the rows below.

    Args:
        trs: all rows of the table body, in document order.
        row_index: index of the row that owns ``cell``.
        col_index: position of ``cell`` among the children of its row.
        cell: the cell carrying the rowspan attribute.
    """
    for offset in range(1, _take_span(cell, 'rowspan')):
        # NOTE(review): the bounds check and the lookup of the cells of the
        # following row were stripped from the reviewed copy; this is the
        # reconstruction, including the ['td', 'th'] selector - confirm.
        if row_index + offset < len(trs):
            below = trs[row_index + offset].findChildren(['td', 'th'], recursive=False)
            if len(below) >= col_index and len(below) > 0:
                if col_index > 0:
                    below[col_index - 1].insert_after(copy.copy(cell))
                else:
                    below[0].insert_before(copy.copy(cell))


def _rows_to_text(tbody, ths, new_name):
    """Replace a header-on-top table by "header:value;" text.

    Every row except the first one is rendered as one "h:v;h:v;;" group.
    The table body is only rewritten (and renamed to ``new_name``) when at
    least one data row exists.
    NOTE(review): whether the rewrite should also happen for a table with a
    header row only could not be determined from the reviewed copy.
    """
    rows = tbody.findChildren('tr', recursive=False)
    groups = []
    for tr in rows[1:]:
        tds = tr.findChildren('td', recursive=False)
        pairs = [th.get_text() + ":" + tds[ind].get_text() + ";"
                 for ind, th in enumerate(ths) if ind < len(tds)]
        groups.append("".join(pairs) + ";")
    if groups:
        tbody.string = "".join(groups)
        tbody.name = new_name


def _head_prefix(cells, head_flag):
    """Build the "h1:h2:" prefix from the header cells next to a value cell.

    Args:
        cells: iterable of [text, flag] cells, ordered from the value cell
            outwards (towards the left or towards the top).
        head_flag: flag value that marks the header cells to collect.
    Returns:
        The collected header texts, outermost first, each followed by ":".
        Cells before the first header are skipped, consecutive identical
        headers (produced by span expansion) are collapsed, and the walk
        stops at the first non-header cell after a header was found.
    """
    head = ""
    found = False
    last = ""
    for cell_text, cell_flag in cells:
        if cell_flag == head_flag:
            if not found or cell_text != last:
                head = cell_text + ":" + head
            found = True
            last = cell_text
        elif found:
            break
    return head


def table2text(soup):
    '''
    Convert the tables of an announcement into plain text (older converter;
    extract() currently uses tableToText instead).

    Args:
        soup: BeautifulSoup instance
    Returns:
        The same BeautifulSoup instance, modified in place
    '''
    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    for tbody in tbodies:
        # Step 1: expand rowspan / colspan so that every row has all its cells.
        trs = tbody.findChildren('tr', recursive=False)
        ths = []
        # Rows that contain <th>, in document order.  This used to be a set
        # that was later indexed through list(set), which made the pairing of
        # header and row depend on hash order.
        header_rows = []
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            if len(ths_tmp) > 0:
                ths.extend(ths_tmp)
                if tr not in header_rows:
                    header_rows.append(tr)
            tds = tr.findChildren(recursive=False)
            if len(tds) > 1:
                for indtd, td in enumerate(tds):
                    _fill_rowspan(trs, indtr, indtd, td)
                    _fill_colspan(td)

        # Step 2: turn the table into text.
        if len(ths) > 1:
            if len(header_rows) == 1:
                # Header row on top, one record per following row.
                _rows_to_text(tbody, ths, 'div')
            else:
                # Header cell at the start of each row, one record per column.
                cells_per_row = [row.findChildren('td', recursive=False)
                                 for row in header_rows]
                parts = []
                for ind in range(len(cells_per_row[0])):
                    for i, row_cells in enumerate(cells_per_row):
                        if ind < len(row_cells):
                            parts.append(ths[i].get_text() + row_cells[ind].get_text() + ";")
                if cells_per_row[0]:
                    tbody.string = "".join(parts)
                    tbody.name = 'p'
        else:
            # No <th>: treat the first row as header when it looks like one.
            trs = tbody.findChildren('tr', recursive=False)
            if len(trs) > 0:
                tds0 = trs[0].findChildren('td', recursive=False)
                if len(tds0) > 2:
                    tds_str = [td.get_text() for td in tds0]
                    pat = re.compile('(序号|项目|产品|货物|单位|数量|价格|金额|总价|中标|供应商|候选|编号|得分|名次|排名|排序|科室){1}')
                    match_counts = re.subn(pat, '', ";".join(tds_str))[1]
                    if match_counts > 2:
                        ths = []
                        for td in tds0:
                            td.name = 'th'
                            ths.append(td)
                        _rows_to_text(tbody, ths, 'p')
    return soup


def tableToText(soup):
    """Rewrite every table of ``soup`` into "header:value" sentences.

    Each table body is expanded (rowspan/colspan), read into a matrix of
    [text, flag] cells, its header rows/columns are detected with keyword
    patterns, and the body is replaced by the generated text.

    Args:
        soup: BeautifulSoup instance
    Returns:
        The same BeautifulSoup instance, modified in place
    """

    def fixSpan(tbody):
        """Expand colspan, then rowspan, of all rows without nested tables."""
        trs = tbody.findChildren('tr', recursive=False)
        # Columns are completed before rows; the other order can scramble the
        # table layout.
        for tr in trs:
            # Rows that contain a nested table are left untouched.
            if len(tr.findChildren('table')) > 0:
                continue
            for td in tr.findChildren(recursive=False):
                _fill_colspan(td)
        for indtr, tr in enumerate(trs):
            if len(tr.findChildren('table')) > 0:
                continue
            for indtd, td in enumerate(tr.findChildren(recursive=False)):
                _fill_rowspan(trs, indtr, indtd, td)

    def getTable(tbody):
        """Read the table into a list of rows of [text, flag] cells (flag 0)."""
        inner_table = []
        for tr in tbody.findChildren('tr', recursive=False):
            tds = tr.findChildren(['td', 'th'], recursive=False)
            inner_table.append([[re.sub(r'\s+', '', td.get_text()), 0] for td in tds])
        return inner_table

    def fixTable(inner_table):
        """Pad short rows so that all rows have the same number of cells."""
        maxWidth = 0
        for item in inner_table:
            if len(item) > maxWidth:
                maxWidth = len(item)
        for row in inner_table:
            # NOTE(review): the padding code was stripped from the reviewed
            # copy; empty value cells are the reconstruction - confirm.
            while len(row) < maxWidth:
                row.append(["", 0])
        return inner_table

    def setHead(inner_table, pattern, pat_value, count):
        """Flag header cells: 1 for header rows, 2 for header columns.

        A row (or a column inside one header segment) is a header when none
        of its cells matches ``pat_value`` and at least ``count`` distinct
        cell texts match ``pattern``.

        NOTE(review): the signature, the initialisation and the start of the
        row loop were stripped from the reviewed copy.  They are
        reconstructed from the call site and from the column loop, which was
        intact; in particular the initial 0 in head_list is an assumption.

        Returns:
            (inner_table, head_list) where head_list holds the segment
            boundaries: 0, the index of every header row, and the height.
        """
        height = len(inner_table)
        width = len(inner_table[0])
        head_list = [0]

        def looks_like_head(cells):
            matched = set()
            for cell in cells:
                if re.search(pat_value, cell[0]) is not None:
                    return False
                if len(re.findall(pattern, cell[0])) > 0:
                    matched.add(cell[0])
            return len(matched) >= count

        # Header rows.
        for i in range(height):
            if looks_like_head(inner_table[i]):
                head_list.append(i)
                for cell in inner_table[i]:
                    cell[1] = 1
        head_list.append(height)

        # Header columns, decided separately inside every segment.
        for seg in range(len(head_list) - 1):
            head_begin = head_list[seg]
            head_end = head_list[seg + 1]
            # The last column is never a header column.
            for col in range(width - 1):
                column = [inner_table[r][col] for r in range(head_begin, head_end)]
                if looks_like_head(column):
                    for cell in column:
                        cell[1] = 2
        return inner_table, head_list

    def getDirect(inner_table, begin, end):
        """Return "row" or "column": the reading direction of rows [begin, end)."""
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for height in range(begin, end):
            for width in range(widths):
                if inner_table[height][width][1] == 1:
                    row_head.add(height)
                if inner_table[height][width][1] == 2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        if 0 in column_head and begin not in row_head:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin, end):
                count = 0
                count_flag = True
                # Fixed: this loop used to be "for width in range(width)",
                # re-using the loop variable as its own bound, so one more
                # column was dropped from the scan on every row.
                for width in range(widths):
                    if inner_table[height][width][1] == 0:
                        if re.search(company_pattern, inner_table[height][width][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                # A row whose values are all company names (at least two).
                if count_flag and count >= 2:
                    return "column"
        return "row"

    def getTableText(inner_table, head_list):
        """Render the flagged matrix as text, one sentence per row or column."""
        rankPattern = "(排名|排序|名次|评标结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        width = len(inner_table[0])

        def head_of(i, j):
            # Row headers found above the cell come first, then the column
            # headers found to its left.
            left = (inner_table[i][j - k] for k in range(1, j + 1))
            above = (inner_table[i - k][j] for k in range(1, i + 1))
            return _head_prefix(above, 1) + _head_prefix(left, 2)

        text = ""
        for head_i in range(len(head_list) - 1):
            text_set = set()
            head_begin = head_list[head_i]
            head_end = head_list[head_i + 1]
            direct = getDirect(inner_table, head_begin, head_end)
            if direct == "row":
                lines = [[(i, j) for j in range(width)]
                         for i in range(head_begin, head_end)]
            else:
                lines = [[(i, j) for i in range(head_begin, head_end)]
                         for j in range(width)]
            for line in lines:
                rank_parts = []
                entity_parts = []
                other_parts = []
                for i, j in line:
                    cell = inner_table[i][j]
                    # Only value cells produce text.
                    if cell[1] != 0:
                        continue
                    head = head_of(i, j)
                    phrase = head + cell[0]
                    # Span expansion duplicates cells; emit each phrase once
                    # per segment.
                    if phrase in text_set:
                        continue
                    if re.search(rankPattern, head) is not None:
                        rank_parts.append(phrase + ",")
                    elif re.search(entityPattern, head) is not None:
                        entity_parts.append(phrase + ",")
                    else:
                        other_parts.append(phrase + ",")
                    text_set.add(phrase)
                # Ranking first, then the entity, then everything else; the
                # trailing character is replaced by a full stop.
                text += "".join(rank_parts) + "".join(entity_parts) + "".join(other_parts)
                text = text[:-1] + "。"
        return text

    pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile(r"(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")

    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # Reverse document order, so that nested tables are converted before the
    # tables that contain them.
    for tbody in reversed(tbodies):
        fixSpan(tbody)
        inner_table = fixTable(getTable(tbody))
        if len(inner_table) > 0:
            inner_table, head_list = setHead(inner_table, pat_head, pat_value, 3)
            tbody.string = getTableText(inner_table, head_list)
            tbody.name = "table"
    return soup


def segment(soup):
    """Flatten ``soup`` into one string without whitespace.

    A full stop is inserted after every <tr>, whitespace inside <td>, <a>
    and <span> is removed, double quotes are replaced, runs of punctuation
    are collapsed and finally all remaining whitespace is deleted.

    Args:
        soup: BeautifulSoup instance (modified in place)
    Returns:
        The resulting text
    """
    # Other candidates that were tried for these lists:
    #   segList: "div", "h1".."h6", "header"; commaList: "p", "div", "br", "td", "span"
    segList = ["tr"]
    commaList = []
    spaceList = ["span"]
    subspaceList = ["td", 'a', "span"]

    # A document without <body> (e.g. empty content) used to raise
    # AttributeError here; fall back to the whole document.
    root = soup.body if soup.body is not None else soup
    # Snapshot the nodes first: markers are inserted into the tree below.
    for child in list(root.descendants):
        if child.name in segList:
            child.insert_after("。")
        if child.name in commaList:
            child.insert_after(",")
        if child.name in subspaceList:
            child.insert_before("#subs" + str(child.name) + "#")
            child.insert_after("#sube" + str(child.name) + "#")
        if child.name in spaceList:
            child.insert_after(" ")
    text = str(soup.get_text())

    # Double quotes break the import into deepdive.
    text = text.replace('"', "“").replace("\r", "").replace("\n", "")
    # NOTE(review): the original comments on the next three substitutions say
    # "replace the English colon/comma/semicolon with the Chinese one", but
    # as written pattern and replacement are the same character, so they are
    # no-ops.  The full-width characters were presumably lost (the same may
    # apply to the punctuation handled further down) - confirm before
    # changing any of these literals.
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])", ",", text)
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])", ";", text)

    # Remove all whitespace inside the marked tags.  The loop restarts the
    # search after every replacement so that nested markers are resolved too.
    for subs in subspaceList:
        start_mark = "#subs" + str(subs) + "#"
        end_mark = "#sube" + str(subs) + "#"
        marked = re.compile(start_mark + "(.*?)" + end_mark)
        while True:
            oneMatch = marked.search(text)
            if oneMatch is None:
                break
            inner = oneMatch.group(1)
            text = text.replace(start_mark + inner + end_mark, re.sub(r"\s", "", inner))

    # Collapse punctuation.  Every substitution shortens the text, so the
    # loop terminates.
    # NOTE(review): the group name "punc" was stripped from the three
    # patterns in the reviewed copy and is restored from punc.group("punc").
    while True:
        # comma followed by another punctuation mark -> keep the other mark
        punc = re.search(r",(?P<punc>:|。|,|;)\s*", text)
        if punc is not None:
            text = re.sub("," + punc.group("punc") + r"\s*", punc.group("punc"), text)
        # punctuation mark followed by a comma -> keep the mark
        punc = re.search(r"(?P<punc>:|。|,|;)\s*,", text)
        if punc is not None:
            text = re.sub(punc.group("punc") + r"\s*,", punc.group("punc"), text)
        else:
            # whitespace after a punctuation mark
            punc = re.search(r"(?P<punc>:|。|,|;)\s+", text)
            if punc is not None:
                text = re.sub(punc.group("punc") + r"\s+", punc.group("punc"), text)
            else:
                break

    # Collapse repeated full stops (this also drops a leading/trailing one).
    text = "。".join([x for x in text.split("。") if len(x) > 0])
    # Delete all remaining whitespace.
    text = re.sub(r"\s+", "", text)
    return text


@tsv_extractor
@returns(lambda
        doc_id = "text",
        content = "text",
    :[])
def extract(
        doc_id = "text",
        content = "text",
    ):
    """Yield [doc_id, flattened text] for one HTML document.

    Documents whose flattened text is longer than 20000 characters yield
    nothing.
    """
    log("doc_id=" + str(doc_id))
    # table2text is the older table converter; tableToText is the one in use.
    content_new = segment(tableToText(BeautifulSoup(content, "lxml")))
    if len(content_new) <= 20000:
        yield [
            doc_id,
            content_new,
        ]