|
- import pickle
- import re
- import copy
- import codecs
- from bs4 import BeautifulSoup
- import glob
def save(object_to_save, path):
    """Pickle ``object_to_save`` to the file at ``path``.

    Args:
        object_to_save: Any picklable object.
        path: Destination file path; an existing file is overwritten.

    Returns:
        The ``path`` that was written, as the original docstring promised
        (the function previously returned ``None``).
    """
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)
    return path
def load(path):
    """Read back an object that was pickled to ``path``.

    Args:
        path: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only use this on files that
        come from a trusted source.
    """
    with open(path, 'rb') as stream:
        return pickle.load(stream)
def tableToText(soup):
    """Rewrite every table in ``soup`` as sentence-like text, in place.

    For each ``<tbody>`` (or each ``<table>`` when the document has no
    ``<tbody>``), span attributes are expanded, the cells are read into a
    grid, header rows/columns are guessed with keyword patterns, and the
    element's content is replaced by generated ``head:value`` sentences.

    Args:
        soup: A BeautifulSoup document or tag. It is mutated.

    Returns:
        The same ``soup`` object (not a string); call ``get_text()`` on it to
        obtain the text.
    """

    def fixSpan(tbody):
        """Expand ``colspan``/``rowspan`` cells into repeated copies of the cell."""
        trs = tbody.findChildren('tr', recursive=False)

        # Columns are expanded for the whole table before any row span is
        # handled; mixing the two passes can scramble the cell positions.
        for tr in trs:
            # Rows that contain a nested table are not expanded.
            if len(tr.findChildren('table')) > 0:
                continue
            for td in tr.findChildren(recursive=False):
                if 'colspan' not in td.attrs:
                    continue
                digits = re.sub("[^0-9]", "", str(td['colspan']))
                if digits == "":
                    continue
                col = int(digits)
                # Reset before copying so the copies do not span again.
                td['colspan'] = 1
                for _ in range(1, col):
                    td.insert_after(copy.copy(td))

        for indtr, tr in enumerate(trs):
            # Rows that contain a nested table are not expanded.
            if len(tr.findChildren('table')) > 0:
                continue
            for indtd, td in enumerate(tr.findChildren(recursive=False)):
                if 'rowspan' not in td.attrs:
                    continue
                digits = re.sub("[^0-9]", "", str(td['rowspan']))
                if digits == "":
                    continue
                row = int(digits)
                td['rowspan'] = 1
                for i in range(1, row):
                    # Copy the cell into the same position of each spanned row.
                    if indtr + i >= len(trs):
                        continue
                    tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                    if len(tds1) >= indtd and len(tds1) > 0:
                        if indtd > 0:
                            tds1[indtd - 1].insert_after(copy.copy(td))
                        else:
                            tds1[0].insert_before(copy.copy(td))

    def getTable(tbody):
        """Return the table as rows of ``[text, flag]`` cells.

        ``text`` is the cell text with all whitespace removed. ``flag`` starts
        at 0 (value); ``setHead`` later sets 1 (row header) or 2 (column
        header).
        """
        inner_table = []
        for tr in tbody.findChildren('tr', recursive=False):
            cells = tr.findChildren(['td', 'th'], recursive=False)
            inner_table.append([[re.sub(r'\s*', '', td.get_text()), 0] for td in cells])
        return inner_table

    def fixTable(inner_table):
        """Pad short rows with empty value cells so every row has equal width."""
        max_width = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            for _ in range(max_width - len(row)):
                row.append(["", 0])
        return inner_table

    def setHead(inner_table, pattern, pat_value, count):
        """Flag header cells and split the table into segments.

        Args:
            inner_table: Rectangular grid produced by ``fixTable``.
            pattern: Regex whose match marks a cell as header-like.
            pat_value: Regex whose match marks a cell as value-like.
            count: Minimum number of distinct header-like cell texts needed.

        Returns:
            ``(inner_table, head_list)`` where ``head_list`` holds the row
            indices at which segments start, beginning with 0 and ending with
            the table height.
        """
        height = len(inner_table)
        width = len(inner_table[0])

        def is_head_like(texts):
            # Collect distinct header-like texts up to the first value-like one.
            # NOTE(review): a value-like cell only stops the scan; header-like
            # cells seen before it still count. Confirm this is intended.
            matched = set()
            for cell_text in texts:
                if re.search(pat_value, cell_text) is not None:
                    break
                if re.search(pattern, cell_text) is not None:
                    matched.add(cell_text)
            return len(matched) >= count

        head_list = [0]
        # Row headers.
        is_head_last = False
        for i in range(height):
            row_texts = [inner_table[i][j][0] for j in range(width)]
            is_same_value = all(t == row_texts[0] for t in row_texts)
            is_head = is_head_like(row_texts)
            is_long_value = len(row_texts[0]) > 40
            if is_head or is_long_value or is_same_value:
                # Consecutive header rows share one segment.
                if not is_head_last:
                    head_list.append(i)
                if is_head:
                    for j in range(width):
                        inner_table[i][j][1] = 1
            is_head_last = is_head
        head_list.append(height)

        # Column headers, decided separately inside every segment.
        for seg in range(len(head_list) - 1):
            head_begin = head_list[seg]
            head_end = head_list[seg + 1]
            # The last column is never treated as a column header.
            for col in range(width - 1):
                col_texts = [inner_table[r][col][0] for r in range(head_begin, head_end)]
                if is_head_like(col_texts):
                    for r in range(head_begin, head_end):
                        inner_table[r][col][1] = 2
        return inner_table, head_list

    def getDirect(inner_table, begin, end):
        """Return ``"row"`` or ``"column"``: the reading direction for rows ``begin..end-1``."""
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for height in range(begin, end):
            for width in range(widths):
                if inner_table[height][width][1] == 1:
                    row_head.add(height)
                if inner_table[height][width][1] == 2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        if 0 in column_head and begin not in row_head:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin, end):
                count = 0
                count_flag = True
                # Scan every column. The original used the leaked loop variable
                # ``width`` here, which silently skipped the last column.
                for width_index in range(widths):
                    if inner_table[height][width_index][1] == 0:
                        if re.search(company_pattern, inner_table[height][width_index][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                # A row whose value cells are all company names (at least two).
                if count_flag and count >= 2:
                    return "column"
        return "row"

    def getTableText(inner_table, head_list):
        """Generate the sentence text for the table, segment by segment."""
        rankPattern = re.compile("(排名|排序|名次|评标结果|评审结果)")
        entityPattern = re.compile("(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)")
        width = len(inner_table[0])

        def collect_heads(cells, flag, head):
            # Prepend the texts of the first contiguous run of ``flag`` cells
            # found in ``cells``; equal neighbours in the run are added once.
            find_flag = False
            temp_head = ""
            for cell in cells:
                if cell[1] == flag:
                    if not find_flag or cell[0] != temp_head:
                        head = cell[0] + ":" + head
                    find_flag = True
                    temp_head = cell[0]
                elif find_flag:
                    break
            return head

        def build_head(i, j, head_begin):
            # Column headers (flag 2) to the left of the cell, nearest first,
            # then row headers (flag 1) above it within the current segment.
            left = (inner_table[i][j - step] for step in range(1, j + 1))
            head = collect_heads(left, 2, "")
            above = (inner_table[i - step][j] for step in range(0, i + 1 - head_begin))
            return collect_heads(above, 1, head)

        def render_line(cells, head_begin):
            # One sentence: rank phrases first, then entity phrases, then the rest.
            rank_parts = []
            entity_parts = []
            other_parts = []
            # Shared by the whole sentence. The original recreated this set
            # for every cell, so the duplicate check could never trigger.
            text_set = set()
            for i, j in cells:
                cell = inner_table[i][j]
                # Only value cells produce text.
                if cell[1] != 0:
                    continue
                head = build_head(i, j, head_begin)
                phrase = head + cell[0]
                if phrase in text_set:
                    continue
                text_set.add(phrase)
                if re.search(rankPattern, head) is not None:
                    rank_parts.append(phrase)
                elif re.search(entityPattern, head) is not None:
                    entity_parts.append(phrase)
                else:
                    other_parts.append(phrase)
            return "".join(part + "," for part in rank_parts + entity_parts + other_parts)

        text = ""
        for head_i in range(len(head_list) - 1):
            head_begin = head_list[head_i]
            head_end = head_list[head_i + 1]
            rows = range(head_begin, head_end)

            direct = getDirect(inner_table, head_begin, head_end)
            if direct == "row":
                lines = ([(i, j) for j in range(width)] for i in rows)
            else:
                lines = ([(i, j) for i in rows] for j in range(width))

            for cells in lines:
                text += render_line(cells, head_begin)
                # Swap the trailing "," for "。". An empty line re-terminates
                # the previous sentence, or yields a lone "。" when nothing
                # precedes it (kept from the original).
                text = text[:-1] + "。"
        return text

    pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    # NOTE(review): the "." in "\d{2,}.\d{1}" is unescaped and matches any
    # character; presumably a decimal point was meant. Left unchanged.
    pat_value = re.compile(r"(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")

    tbodies = soup.find_all('tbody')
    if len(tbodies) == 0:
        tbodies = soup.find_all('table')
    # Walk the matches in reverse document order so nested tables are
    # flattened before the tables that contain them.
    for tbody in reversed(tbodies):
        fixSpan(tbody)
        inner_table = fixTable(getTable(tbody))
        if len(inner_table) > 0 and len(inner_table[0]) > 0:
            inner_table, head_list = setHead(inner_table, pat_head, pat_value, 3)
            tbody.string = getTableText(inner_table, head_list)
            # NOTE(review): the rename is assumed to apply only to tables that
            # were converted; confirm against the original indentation.
            tbody.name = "table"
    return soup
def getText(article):
    """Return the plain text of an HTML article after its tables are flattened.

    Args:
        article: HTML markup, parsed with the ``lxml`` parser.

    Returns:
        The text of the document after ``tableToText`` has rewritten it.
    """
    parsed = BeautifulSoup(article, "lxml")
    return tableToText(parsed).get_text()
if __name__ == "__main__":
    # Local import: only this script entry point needs it.
    import os

    # Collect [file name, raw html] pairs from a local folder and pickle them.
    home = "C:\\Users\\User\\Desktop\\20190416要素\\*.html"
    data = []
    for file in glob.glob(home):
        # Close each file as soon as it is read; the original left every
        # handle open.
        with codecs.open(file, "r", encoding="utf8") as f:
            article = f.read()
        # Disabled filter on the extracted text length:
        # text = getText(article)
        # if len(text)<400:
        #     print(file.split("\\")[-1])
        #     continue
        data.append([os.path.basename(file), article])
    save(data, "data.pk")
    print("length:", len(data))
    # Disabled lookup of one document's position in a saved data set:
    # data = load("data_zb.pk")
    # print(len(data))
    # a = set()
    # index = 0
    # for item in data:
    #     index += 1
    #     if item[0]=="比地_52_57160814.html":
    #         print("index",index)
    #         break
-
-
-
-
|