''' Created on 2019年3月25日 @author: User ''' import glob import re import copy from bs4 import BeautifulSoup import codecs import pandas as pd from BiddingKG.dl.interface.predictor import * from BiddingKG.dl.form.feature import * import psycopg2 from BiddingKG.dl.common.Utils import * # formPredictor = FormPredictor() def tableToText(soup,data,file,data_set_is,data_set_no): ''' @param: soup:网页html的soup @return:处理完表格信息的网页text ''' def getTrs(tbody): #获取所有的tr trs = [] objs = tbody.find_all(recursive=False) for obj in objs: if obj.name=="tr": trs.append(obj) if obj.name=="tbody": for tr in obj.find_all("tr",recursive=False): trs.append(tr) return trs def fixSpan(tbody): # 处理colspan, rowspan信息补全问题 #trs = tbody.findChildren('tr', recursive=False) trs = getTrs(tbody) ths_len = 0 ths = list() trs_set = set() #修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱 # 遍历每一个tr for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) #不补全含有表格的tr if len(tr.findChildren('table'))>0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) for indtd, td in enumerate(tds): # 若有colspan 则补全同一行下一个位置 if 'colspan' in td.attrs: if str(re.sub("[^0-9]","",str(td['colspan'])))!="": col = int(re.sub("[^0-9]","",str(td['colspan']))) td['colspan'] = 1 for i in range(1, col, 1): td.insert_after(copy.copy(td)) for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) #不补全含有表格的tr if len(tr.findChildren('table'))>0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) for indtd, td in enumerate(tds): # 若有rowspan 则补全下一行同样位置 if 'rowspan' in td.attrs: if str(re.sub("[^0-9]","",str(td['rowspan'])))!="": row = int(re.sub("[^0-9]","",str(td['rowspan']))) td['rowspan'] = 1 for i in range(1, row, 1): # 获取下一行的所有td, 在对应的位置插入 if indtr+i= (indtd) and len(tds1)>0: if indtd > 0: tds1[indtd - 1].insert_after(copy.copy(td)) else: tds1[0].insert_before(copy.copy(td)) def getTable(tbody): #trs = tbody.findChildren('tr', recursive=False) trs = getTrs(tbody) inner_table = [] for tr in trs: tr_line = [] tds = tr.findChildren(['td','th'], recursive=False) for td in tds: tr_line.append([re.sub('\s*','',td.get_text()),0]) inner_table.append(tr_line) return inner_table #处理表格不对齐的问题 def fixTable(inner_table): maxWidth = 0 for item in inner_table: if len(item)>maxWidth: maxWidth = len(item) for i in range(len(inner_table)): if len(inner_table[i])prob_min: is_head = True else: is_head = False #print(temp_item,form_prob) if len(inner_table[i][0][0])>40: is_long_value = True if is_head or is_long_value or is_same_value: #不把连续表头分开 if not is_head_last: head_list.append(i) if is_long_value or is_same_value: head_list.append(i+1) if is_head: for j in range(width): if inner_table[i][j][0] not in data_set_is and inner_table[i][j][0] not in data_set_no: data.append([file,inner_table[i][j][0],1]) data_set_is.add(inner_table[i][j][0]) inner_table[i][j][1] = 1 is_head_last = is_head head_list.append(height) #列表头 for i in range(len(head_list)-1): head_begin = head_list[i] head_end = head_list[i+1] #最后一列不设置为列表头 for i in range(width-1): is_head = False #predict is head or not with model temp_item = "" for j in range(head_begin,head_end): temp_item += inner_table[j][i][0]+"|" temp_item = re.sub(pad_pattern,"",temp_item) form_prob = formPredictor.predict(encoding(temp_item,expand=True)) if form_prob is not None: if form_prob[0][1]>prob_min: is_head = True else: is_head = False if is_head: for j in range(head_begin,head_end): if inner_table[j][i][0] not in data_set_is and inner_table[j][i][0] not in data_set_no: data.append([file,inner_table[j][i][0],1]) data_set_is.add(inner_table[j][i][0]) inner_table[j][i][1] = 2 for line in inner_table: for item in line: if item[0] not in data_set_is and item[0] not in data_set_no: data.append([file,item[0],0]) data_set_no.add(item[0]) addPadding(inner_table, pad_row, pad_col) return inner_table,head_list #设置表头 def setHead_withRule(inner_table,pattern,pat_value,count): height = len(inner_table) width = len(inner_table[0]) head_list = [] head_list.append(0) #行表头 is_head_last = False for i in range(height): set_match = set() is_head = False is_long_value = False is_same_value = True same_value = inner_table[i][0][0] for j in range(width): if inner_table[i][j][0]!=same_value: is_same_value = False break for j in range(width): if re.search(pat_value,inner_table[i][j][0]) is not None: is_head = False break str_find = re.findall(pattern,inner_table[i][j][0]) if len(str_find)>0: set_match.add(inner_table[i][j][0]) if len(set_match)>=count: is_head = True if len(inner_table[i][0][0])>40: is_long_value = True if is_head or is_long_value or is_same_value: if not is_head_last: head_list.append(i) if is_head: for j in range(width): inner_table[i][j][1] = 1 is_head_last = is_head head_list.append(height) #列表头 for i in range(len(head_list)-1): head_begin = head_list[i] head_end = head_list[i+1] #最后一列不设置为列表头 for i in range(width-1): set_match = set() is_head = False for j in range(head_begin,head_end): if re.search(pat_value,inner_table[j][i][0]) is not None: is_head = False break str_find = re.findall(pattern,inner_table[j][i][0]) if len(str_find)>0: set_match.add(inner_table[j][i][0]) if len(set_match)>=count: is_head = True if is_head: for j in range(head_begin,head_end): inner_table[j][i][1] = 2 return inner_table,head_list #取得表格的处理方向 def getDirect(inner_table,begin,end): column_head = set() row_head = set() widths = len(inner_table[0]) for height in range(begin,end): for width in range(widths): if inner_table[height][width][1] ==1: row_head.add(height) if inner_table[height][width][1] ==2: column_head.add(width) company_pattern = re.compile("公司") if 0 in column_head and begin not in row_head: return "column" if 0 in column_head and begin in row_head: for height in range(begin,end): count = 0 count_flag = True for width_index in range(width): if inner_table[height][width_index][1]==0: if re.search(company_pattern,inner_table[height][width_index][0]) is not None: count += 1 else: count_flag = False if count_flag and count>=2: return "column" return "row" #根据表格处理方向生成句子, def getTableText(inner_table,head_list): rankPattern = "(排名|排序|名次|评标结果|评审结果)" entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)" height = len(inner_table) width = len(inner_table[0]) text = "" for head_i in range(len(head_list)-1): head_begin = head_list[head_i] head_end = head_list[head_i+1] direct = getDirect(inner_table, head_begin, head_end) if direct=="row": for i in range(head_begin,head_end): rank_text = "" entity_text = "" text_line = "" #在同一句话中重复的可以去掉 text_set = set() for j in range(width): cell = inner_table[i][j] #是属性值 if cell[1]==0: find_flag = False head = "" temp_head = "" for loop_j in range(1,j+1): if inner_table[i][j-loop_j][1]==2: if find_flag: if inner_table[i][j-loop_j][0]!=temp_head: head = inner_table[i][j-loop_j][0]+":"+head else: head = inner_table[i][j-loop_j][0]+":"+head find_flag = True temp_head = inner_table[i][j-loop_j][0] else: if find_flag: break find_flag = False temp_head = "" for loop_i in range(0,i+1-head_begin): if inner_table[i-loop_i][j][1]==1: if find_flag: if inner_table[i-loop_i][j][0]!=temp_head: head = inner_table[i-loop_i][j][0]+":"+head else: head = inner_table[i-loop_i][j][0]+":"+head find_flag = True temp_head = inner_table[i-loop_i][j][0] else: #找到表头后遇到属性值就返回 if find_flag: break if str(head+inner_table[i][j][0]) in text_set: continue if re.search(rankPattern,head) is not None: rank_text += head+inner_table[i][j][0]+"," #print(rank_text) elif re.search(entityPattern,head) is not None: entity_text += head+inner_table[i][j][0]+"," #print(entity_text) else: text_line += head+inner_table[i][j][0]+"," text_set.add(str(head+inner_table[i][j][0])) text += rank_text+entity_text+text_line text = text[:-1]+"。" else: for j in range(width): rank_text = "" entity_text = "" text_line = "" text_set = set() for i in range(head_begin,head_end): cell = inner_table[i][j] #是属性值 if cell[1]==0: find_flag = False head = "" temp_head = "" for loop_j in range(1,j+1): if inner_table[i][j-loop_j][1]==2: if find_flag: if inner_table[i][j-loop_j][0]!=temp_head: head = inner_table[i][j-loop_j][0]+":"+head else: head = inner_table[i][j-loop_j][0]+":"+head find_flag = True temp_head = inner_table[i][j-loop_j][0] else: if find_flag: break find_flag = False temp_head = "" for loop_i in range(0,i+1-head_begin): if inner_table[i-loop_i][j][1]==1: if find_flag: if inner_table[i-loop_i][j][0]!=temp_head: head = inner_table[i-loop_i][j][0]+":"+head else: head = inner_table[i-loop_i][j][0]+":"+head find_flag = True temp_head = inner_table[i-loop_i][j][0] else: if find_flag: break if str(head+inner_table[i][j][0]) in text_set: continue if re.search(rankPattern,head) is not None: rank_text += head+inner_table[i][j][0]+"," #print(rank_text) elif re.search(entityPattern,head) is not None: entity_text += head+inner_table[i][j][0]+"," #print(entity_text) else: text_line += head+inner_table[i][j][0]+"," text_set.add(str(head+inner_table[i][j][0])) text += rank_text+entity_text+text_line text = text[:-1]+"。" return text def trunTable(tbody): fixSpan(tbody) inner_table = getTable(tbody) inner_table = fixTable(inner_table) if len(inner_table)>0 and len(inner_table[0])>0: #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3) inner_table,head_list = setHead(inner_table) ''' print("----") print(head_list) for item in inner_table: print(item) ''' tbody.string = getTableText(inner_table,head_list) #print(tbody.string) tbody.name = "table" pat_head = re.compile('(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造)') #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)') pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)") tbodies = soup.find_all('table') # 遍历表格中的每个tbody #逆序处理嵌套表格 for tbody_index in range(1,len(tbodies)+1): tbody = tbodies[len(tbodies)-tbody_index] trunTable(tbody) tbodies = soup.find_all('tbody') # 遍历表格中的每个tbody #逆序处理嵌套表格 for tbody_index in range(1,len(tbodies)+1): tbody = tbodies[len(tbodies)-tbody_index] trunTable(tbody) return soup def getSourceData(): data = [] data_set_is = set() data_set_no = set() for file in glob.glob("C:\\Users\\User\\Desktop\\20190320要素\\*.html"): filename = file.split("\\")[-1] source = codecs.open(file,"r",encoding="utf8").read() tableToText(BeautifulSoup(source,"lxml"),data,filename,data_set_is,data_set_no) for file in glob.glob("C:\\Users\\User\\Desktop\\20190306要素\\*.html"): filename = file.split("\\")[-1] source = codecs.open(file,"r",encoding="utf8").read() tableToText(BeautifulSoup(source,"lxml"),data,filename,data_set_is,data_set_no) '''''' list_file = [] list_item = [] list_label = [] #data.sort(key=lambda x:x[2],reverse=True) data = data[0:60000] for item in data: list_file.append(item[0]) list_item.append(item[1][:100]) list_label.append(item[2]) df = pd.DataFrame({"list_file":list_file,"list_item":list_item,"list_label":list_label}) df.to_excel("data_item.xls",columns=["list_file","list_item","list_label"]) def importData(): conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() file = "data_item.xls" df = pd.read_excel(file) for file,text,label in zip(df["list_file"],df["list_item"],df["list_label"]): text = str(text) text = text.replace("\\","\\\\") text = re.sub("'","\\'",str(text)) sql = " insert into form(filename,text,label) values(E'"+file+"',E'"+str(text)+"',E'"+str(int(label))+"')" print(sql) cursor.execute(sql) conn.commit() conn.close() def selectWithRule(source,filter,target): assert source!=target dict_source = pd.read_excel(source) set_filter = set() for filt in filter: set_filter = set_filter | set(pd.read_excel(filt)["list_item"]) list_file = [] list_item = [] list_label = [] for file,text,label in zip(dict_source["list_file"],dict_source["list_item"],dict_source["list_label"]): if str(text) in set_filter: continue if re.search(".{8,}(工程|项目|采购|公告|公示)",str(text)) is not None: #if len(str(text))>20: list_file.append(file) list_item.append(text) list_label.append(label) data = {"list_file":list_file,"list_item":list_item,"list_label":list_label} columns = ["list_file","list_item","list_label"] df = pd.DataFrame(data) df.to_excel(target,index=False,columns=columns) def importRelabel(): files = ["批量.xls"] conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() for file in files: df = pd.read_excel(file) for text,relabel in zip(df["list_item"],df["list_relabel"]): text = str(text) text = text.replace("\\","\\\\") text = re.sub("'","\\'",str(text)) sql = " update form set relabel='"+str(int(relabel))+"' where text=E'"+str(text)+"' " cursor.execute(sql) conn.commit() conn.close() def getHtml(): conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = " select filename from form where relabel is NULL group by filename having count(1)>0 " cursor.execute(sql) rows = cursor.fetchall() data = [] index = 0 for row in rows: filename = row[0] if filename=="比地_101_58519594.html": print(index) path = "C:\\Users\\User\\Desktop\\20190320要素\\"+filename if not os.path.exists(path): path = "C:\\Users\\User\\Desktop\\20190306要素\\"+filename data.append([filename,codecs.open(path,'r',encoding="utf8").read()]) index += 1 #save(data,"namehtml.pk") def getTrainData(percent=0.9): conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101") cursor = conn.cursor() sql = "select filename,text,label,relabel,handlabel from form " cursor.execute(sql) rows = cursor.fetchall() save(rows,"filename_text_label_relabel_handlabel.pk") train_x = [] train_y = [] test_x = [] test_y = [] test_text = [] for row in rows: input = str(row[1]) label = str(int(row[2])) if row[4] is not None: label = str(int(row[4])) elif row[3] is not None: label = str(int(row[3])) item_y = [0,0] item_y[int(label)] = 1 if np.random.random()