|
- '''
- Created on 2019年3月25日
- @author: User
- '''
- import glob
- import re
- import copy
- from bs4 import BeautifulSoup
- import codecs
- import pandas as pd
- from BiddingKG.dl.interface.predictor import *
- from BiddingKG.dl.form.feature import *
- import psycopg2
- from BiddingKG.dl.common.Utils import *
- # formPredictor = FormPredictor()
def tableToText(soup,data,file,data_set_is,data_set_no):
    '''
    Replace every table in ``soup`` with a flat sentence-like text and collect
    the cell texts together with a header / non-header label.

    Each <table> (and afterwards each remaining <tbody>) is expanded
    (colspan / rowspan), padded to a rectangle, classified into row headers,
    column headers and values, and finally replaced by "head:value," style text.

    @param soup: BeautifulSoup tree of the page; modified in place
    @param data: list that receives ``[file, cell_text, label]`` entries
                 (label 1 for a cell of a predicted header row/column, 0 otherwise)
    @param file: file name stored in every ``data`` entry
    @param data_set_is: set of cell texts already recorded as header; updated in place
    @param data_set_no: set of cell texts already recorded as non-header; updated in place
    @return: the same ``soup`` object with the tables replaced by text

    NOTE(review): ``formPredictor`` and ``encoding`` are not defined in this file
    (the module-level ``formPredictor = FormPredictor()`` is commented out); they
    presumably come from the star imports - confirm before running.
    '''

    # Cell labels used throughout: 0 = value, 1 = row header, 2 = column header.

    def getTrs(tbody):
        '''Return the <tr> children of ``tbody`` plus the <tr> children of directly nested <tbody> tags.'''
        trs = []
        for obj in tbody.find_all(recursive=False):
            if obj.name=="tr":
                trs.append(obj)
            if obj.name=="tbody":
                trs.extend(obj.find_all("tr",recursive=False))
        return trs

    def spanCount(td,attr):
        '''Return the digits of ``td[attr]`` as int, or None if the attribute is missing or has no digits.'''
        if attr not in td.attrs:
            return None
        digits = re.sub("[^0-9]","",str(td[attr]))
        if digits=="":
            return None
        return int(digits)

    def fixSpan(tbody):
        '''Expand colspan / rowspan by inserting copies of the spanning cell (modifies the tree).'''
        trs = getTrs(tbody)
        # Columns are completed before rows; doing it the other way round can
        # scramble the parsed table.
        for tr in trs:
            # rows that contain a nested table are not completed
            if len(tr.findChildren('table'))>0:
                continue
            for td in tr.findChildren(recursive=False):
                col = spanCount(td,'colspan')
                if col is None:
                    continue
                td['colspan'] = 1
                # the copies are placed right after the cell, on the same row
                for _ in range(1, col):
                    td.insert_after(copy.copy(td))
        for indtr, tr in enumerate(trs):
            # rows that contain a nested table are not completed
            if len(tr.findChildren('table'))>0:
                continue
            for indtd, td in enumerate(tr.findChildren(recursive=False)):
                row = spanCount(td,'rowspan')
                if row is None:
                    continue
                td['rowspan'] = 1
                for offset in range(1, row):
                    # the span may reach past the last row of the table
                    if indtr+offset>=len(trs):
                        break
                    # insert the copy at the same position in the following row
                    tds1 = trs[indtr + offset].findChildren(['td','th'], recursive=False)
                    if len(tds1) >= indtd and len(tds1)>0:
                        if indtd > 0:
                            tds1[indtd - 1].insert_after(copy.copy(td))
                        else:
                            tds1[0].insert_before(copy.copy(td))

    def getTable(tbody):
        '''Return the table as rows of ``[text_without_whitespace, 0]`` cells.'''
        inner_table = []
        for tr in getTrs(tbody):
            tds = tr.findChildren(['td','th'], recursive=False)
            inner_table.append([[re.sub(r'\s+','',td.get_text()),0] for td in tds])
        return inner_table

    def fixTable(inner_table):
        '''Pad short rows with empty cells so that every row has the width of the widest row.'''
        maxWidth = max((len(row) for row in inner_table), default=0)
        for row in inner_table:
            # every padding cell must be its own list, labels are set per cell later
            row.extend(["",0] for _ in range(maxWidth-len(row)))
        return inner_table

    def removePadding(inner_table,pad_row = "@@",pad_col = "##"):
        '''Replace a non-empty text that repeats its predecessor by ``pad_row`` (along a row) or ``pad_col`` (along a column).'''
        height = len(inner_table)
        width = len(inner_table[0])
        pads = [pad_row,pad_col]
        for i in range(height):
            point = ""
            for j in range(width):
                cell = inner_table[i][j]
                if cell[0]==point and point!="":
                    cell[0] = pad_row
                elif cell[0] not in pads:
                    point = cell[0]
        for j in range(width):
            point = ""
            for i in range(height):
                cell = inner_table[i][j]
                if cell[0]==point and point!="":
                    cell[0] = pad_col
                elif cell[0] not in pads:
                    point = cell[0]

    def addPadding(inner_table,pad_row = "@@",pad_col = "##"):
        '''Undo ``removePadding``: a pad cell takes text and label of its left (``pad_row``) or upper (``pad_col``) neighbour.'''
        height = len(inner_table)
        width = len(inner_table[0])
        for i in range(height):
            for j in range(width):
                cell = inner_table[i][j]
                if cell[0]==pad_row:
                    cell[0] = inner_table[i][j-1][0]
                    cell[1] = inner_table[i][j-1][1]
                # checked after the row pad on purpose: the copied text may be a column pad
                if cell[0]==pad_col:
                    cell[0] = inner_table[i-1][j][0]
                    cell[1] = inner_table[i-1][j][1]

    def setHead(inner_table,prob_min=0.64):
        '''
        Label header cells with the form model and split the table into segments.

        @param inner_table: rectangular table of ``[text, label]`` cells; labels are set in place
        @param prob_min: a row / column is a header when ``form_prob[0][1]`` exceeds this value
        @return: (inner_table, head_list); consecutive entries of head_list are the
                 [begin, end) row ranges later handled one by one
        '''
        pad_row = "@@"
        pad_col = "##"
        removePadding(inner_table, pad_row, pad_col)
        pad_pattern = re.compile(pad_row+"|"+pad_col)
        height = len(inner_table)
        width = len(inner_table[0])

        def predictHead(values):
            # the cell texts are joined with "|" and the pad markers are removed before prediction
            temp_item = "".join(value+"|" for value in values)
            temp_item = re.sub(pad_pattern,"",temp_item)
            form_prob = formPredictor.predict(encoding(temp_item,expand=True))
            return form_prob is not None and bool(form_prob[0][1]>prob_min)

        def collect(value,label,seen):
            # a text is recorded once only, whatever label it got the first time
            if value not in data_set_is and value not in data_set_no:
                data.append([file,value,label])
                seen.add(value)

        head_list = [0]
        # row headers
        is_head_last = False
        for i in range(height):
            row_values = [inner_table[i][j][0] for j in range(width)]
            # the whole row is one value (ignoring row padding)
            same_value = row_values[0]
            is_same_value = all(value==same_value or value==pad_row for value in row_values)
            is_head = predictHead(row_values)
            is_long_value = len(row_values[0])>40
            if is_head or is_long_value or is_same_value:
                # consecutive header rows are not split apart
                if not is_head_last:
                    head_list.append(i)
                if is_long_value or is_same_value:
                    head_list.append(i+1)
            if is_head:
                for j in range(width):
                    collect(inner_table[i][j][0],1,data_set_is)
                    inner_table[i][j][1] = 1
            is_head_last = is_head
        head_list.append(height)

        # column headers, decided separately inside every segment
        for segment in range(len(head_list)-1):
            head_begin = head_list[segment]
            head_end = head_list[segment+1]
            # the last column is never made a column header
            for col in range(width-1):
                column_values = [inner_table[row][col][0] for row in range(head_begin,head_end)]
                if predictHead(column_values):
                    for row in range(head_begin,head_end):
                        collect(inner_table[row][col][0],1,data_set_is)
                        inner_table[row][col][1] = 2

        # everything not recorded so far is recorded as non-header
        for line in inner_table:
            for item in line:
                collect(item[0],0,data_set_no)

        addPadding(inner_table, pad_row, pad_col)
        return inner_table,head_list

    def setHead_withRule(inner_table,pattern,pat_value,count):
        '''
        Rule based alternative to ``setHead`` (currently only referenced from a comment in ``trunTable``).

        A row / column is a header when at least ``count`` distinct cell texts match
        ``pattern``; scanning of a row / column stops at the first cell matching ``pat_value``.
        @return: (inner_table, head_list) like ``setHead``
        '''
        height = len(inner_table)
        width = len(inner_table[0])

        def matchesRule(values):
            # NOTE(review): the count check is placed after the scan, so a value-like
            # cell only stops the scan; confirm this against the original indentation.
            set_match = set()
            for value in values:
                if re.search(pat_value,value) is not None:
                    break
                if len(re.findall(pattern,value))>0:
                    set_match.add(value)
            return len(set_match)>=count

        head_list = [0]
        # row headers
        is_head_last = False
        for i in range(height):
            row_values = [inner_table[i][j][0] for j in range(width)]
            is_same_value = all(value==row_values[0] for value in row_values)
            is_head = matchesRule(row_values)
            is_long_value = len(row_values[0])>40
            if is_head or is_long_value or is_same_value:
                if not is_head_last:
                    head_list.append(i)
            if is_head:
                for j in range(width):
                    inner_table[i][j][1] = 1
            is_head_last = is_head
        head_list.append(height)

        # column headers
        for segment in range(len(head_list)-1):
            head_begin = head_list[segment]
            head_end = head_list[segment+1]
            # the last column is never made a column header
            for col in range(width-1):
                column_values = [inner_table[row][col][0] for row in range(head_begin,head_end)]
                if matchesRule(column_values):
                    for row in range(head_begin,head_end):
                        inner_table[row][col][1] = 2
        return inner_table,head_list

    def getDirect(inner_table,begin,end):
        '''Return "row" or "column": the direction in which rows ``begin``..``end``-1 are turned into sentences.'''
        column_head = set()
        row_head = set()
        widths = len(inner_table[0])
        for height in range(begin,end):
            for width in range(widths):
                label = inner_table[height][width][1]
                if label==1:
                    row_head.add(height)
                if label==2:
                    column_head.add(width)
        if 0 not in column_head:
            return "row"
        if begin not in row_head:
            return "column"
        # Header row and header column at the same time: read by column when some
        # row has at least two value cells and all of them contain "公司".
        company_pattern = re.compile("公司")
        for height in range(begin,end):
            count = 0
            count_flag = True
            # Fixed: the original looped over range(width), where ``width`` was the
            # leftover index of the loop above, so the last column was never checked.
            for width in range(widths):
                cell = inner_table[height][width]
                if cell[1]==0:
                    if company_pattern.search(cell[0]) is not None:
                        count += 1
                    else:
                        count_flag = False
            if count_flag and count>=2:
                return "column"
        return "row"

    def getTableText(inner_table,head_list):
        '''Build the text of the table: one "。"-terminated sentence per row or per column of every segment.'''
        rankPattern = "(排名|排序|名次|评标结果|评审结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        width = len(inner_table[0])

        def findHead(i,j,head_begin):
            '''Return the "head:" prefix of value cell (i, j), or "" if no header is found.'''
            head = ""
            # column headers: walk to the left on the same row
            find_flag = False
            temp_head = ""
            for loop_j in range(1,j+1):
                left = inner_table[i][j-loop_j]
                if left[1]==2:
                    # a header equal to the one just added is not repeated
                    if not find_flag or left[0]!=temp_head:
                        head = left[0]+":"+head
                    find_flag = True
                    temp_head = left[0]
                elif find_flag:
                    break
            # row headers: walk upwards in the same column, not above the segment
            find_flag = False
            temp_head = ""
            for loop_i in range(0,i+1-head_begin):
                above = inner_table[i-loop_i][j]
                if above[1]==1:
                    if not find_flag or above[0]!=temp_head:
                        head = above[0]+":"+head
                    find_flag = True
                    temp_head = above[0]
                elif find_flag:
                    # stop at the first value cell after a header was found
                    break
            return head

        def buildSentence(positions,head_begin):
            '''Join the value cells at ``positions``: rank items first, then entity items, then the rest.'''
            rank_text = ""
            entity_text = ""
            text_line = ""
            # an item is written only once per sentence
            text_set = set()
            for i,j in positions:
                cell = inner_table[i][j]
                # only value cells are written
                if cell[1]!=0:
                    continue
                head = findHead(i,j,head_begin)
                item = str(head+cell[0])
                if item in text_set:
                    continue
                if re.search(rankPattern,head) is not None:
                    rank_text += item+","
                elif re.search(entityPattern,head) is not None:
                    entity_text += item+","
                else:
                    text_line += item+","
                text_set.add(item)
            return rank_text+entity_text+text_line

        text = ""
        for head_i in range(len(head_list)-1):
            head_begin = head_list[head_i]
            head_end = head_list[head_i+1]
            if getDirect(inner_table, head_begin, head_end)=="row":
                groups = [[(i,j) for j in range(width)] for i in range(head_begin,head_end)]
            else:
                groups = [[(i,j) for i in range(head_begin,head_end)] for j in range(width)]
            for positions in groups:
                text += buildSentence(positions,head_begin)
                # the last character (trailing "," or, for an empty sentence, the
                # previous "。") is replaced by the sentence end
                text = text[:-1]+"。"
        return text

    def trunTable(tbody):
        '''Replace the content of ``tbody`` by its generated text and rename the tag to "table".'''
        fixSpan(tbody)
        inner_table = getTable(tbody)
        inner_table = fixTable(inner_table)
        if len(inner_table)>0 and len(inner_table[0])>0:
            # rule based alternative:
            # inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
            inner_table,head_list = setHead(inner_table)
            tbody.string = getTableText(inner_table,head_list)
            tbody.name = "table"

    # patterns for setHead_withRule (see trunTable)
    pat_head = re.compile('(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造)')
    pat_value = re.compile(r"(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")

    # Tables are handled in reverse document order so that nested tables are
    # converted before the tables that contain them.
    for tbody in reversed(soup.find_all('table')):
        trunTable(tbody)

    # same for the <tbody> tags that are still in the tree
    for tbody in reversed(soup.find_all('tbody')):
        trunTable(tbody)
    return soup
def getSourceData():
    '''
    Run ``tableToText`` over the html files of the two source folders and write
    the collected cell texts with their labels to ``data_item.xls``.

    At most the first 60000 collected entries are written and every text is cut
    to 100 characters.
    '''
    data = []
    data_set_is = set()
    data_set_no = set()
    source_patterns = ["C:\\Users\\User\\Desktop\\20190320要素\\*.html",
                       "C:\\Users\\User\\Desktop\\20190306要素\\*.html"]
    for source_pattern in source_patterns:
        for file in glob.glob(source_pattern):
            filename = file.split("\\")[-1]
            # the file is closed right after reading (it was left open before)
            with codecs.open(file,"r",encoding="utf8") as f:
                source = f.read()
            tableToText(BeautifulSoup(source,"lxml"),data,filename,data_set_is,data_set_no)
    data = data[0:60000]
    df = pd.DataFrame({"list_file":[item[0] for item in data],
                       "list_item":[item[1][:100] for item in data],
                       "list_label":[item[2] for item in data]})
    df.to_excel("data_item.xls",columns=["list_file","list_item","list_label"])
def importData():
    '''
    Insert the rows of ``data_item.xls`` (list_file, list_item, list_label) into
    the ``form`` table.

    The values are passed as query parameters. The statement used to be built by
    string concatenation with hand-written escaping of the text only, so a quote
    in a file name broke (or altered) the statement.
    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        df = pd.read_excel("data_item.xls")
        sql = " insert into form(filename,text,label) values(%s,%s,%s)"
        for file,text,label in zip(df["list_file"],df["list_item"],df["list_label"]):
            # the label is sent as text, as before
            params = (str(file),str(text),str(int(label)))
            print(sql,params)
            cursor.execute(sql,params)
        conn.commit()
    finally:
        # the connection is closed even if reading the file or an insert fails
        conn.close()
def selectWithRule(source,filter,target):
    '''
    Copy the rows of the excel file ``source`` whose text matches the keyword
    rule to the excel file ``target``.

    @param source: excel file with the columns list_file, list_item, list_label
    @param filter: excel files whose list_item values are excluded from the result
    @param target: excel file to write; must differ from ``source``
    '''
    assert source!=target
    dict_source = pd.read_excel(source)
    set_filter = set()
    for filt in filter:
        set_filter |= set(pd.read_excel(filt)["list_item"])

    # at least 8 characters followed by one of the keywords
    rule = ".{8,}(工程|项目|采购|公告|公示)"
    rows = zip(dict_source["list_file"],dict_source["list_item"],dict_source["list_label"])
    selected = [
        (file,text,label)
        for file,text,label in rows
        if str(text) not in set_filter and re.search(rule,str(text)) is not None
    ]

    columns = ["list_file","list_item","list_label"]
    data = {
        "list_file":[row[0] for row in selected],
        "list_item":[row[1] for row in selected],
        "list_label":[row[2] for row in selected],
    }
    pd.DataFrame(data).to_excel(target,index=False,columns=columns)
-
def importRelabel():
    '''
    Write the list_relabel values of ``批量.xls`` into ``form.relabel`` for the
    rows whose ``text`` equals the list_item value.

    The values are passed as query parameters instead of being escaped by hand
    and concatenated into the statement.
    '''
    files = ["批量.xls"]
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        sql = " update form set relabel=%s where text=%s "
        for file in files:
            df = pd.read_excel(file)
            for text,relabel in zip(df["list_item"],df["list_relabel"]):
                # the relabel is sent as text, as before
                cursor.execute(sql,(str(int(relabel)),str(text)))
        conn.commit()
    finally:
        # the connection is closed even if reading a file or an update fails
        conn.close()
-
def getHtml():
    '''
    Read the html source of every file that has rows without relabel in ``form``.

    A file is looked up in the 20190320 folder first and in the 20190306 folder
    otherwise.

    @return: list of ``[filename, html_source]``. The list used to be built and
             then dropped (the ``save`` call below is commented out); it is
             returned now so that the result can be used.
    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        sql = " select filename from form where relabel is NULL group by filename having count(1)>0 "
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # the connection was never closed before
        conn.close()
    data = []
    for index,row in enumerate(rows):
        filename = row[0]
        # debugging aid: print the position of this particular file
        if filename=="比地_101_58519594.html":
            print(index)
        path = "C:\\Users\\User\\Desktop\\20190320要素\\"+filename
        if not os.path.exists(path):
            path = "C:\\Users\\User\\Desktop\\20190306要素\\"+filename
        with codecs.open(path,'r',encoding="utf8") as f:
            data.append([filename,f.read()])
    #save(data,"namehtml.pk")
    return data
def getTrainData(percent=0.9):
    '''
    Load all rows of ``form``, dump them to
    ``filename_text_label_relabel_handlabel.pk`` and split them randomly into a
    train and a test set.

    The class of a row is handlabel if set, else relabel if set, else label.

    @param percent: probability of a row to go into the train set
    @return: (train_x, train_y, test_x, test_y, test_text); the x / y values are
             numpy arrays, y is one-hot over two classes and test_text holds
             ``[filename, text]`` for every test row
    '''
    conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        sql = "select filename,text,label,relabel,handlabel from form "
        cursor.execute(sql)
        rows = cursor.fetchall()
    finally:
        # the connection was never closed before
        conn.close()
    save(rows,"filename_text_label_relabel_handlabel.pk")
    train_x = []
    train_y = []
    test_x = []
    test_y = []
    test_text = []
    for filename,text,label,relabel,handlabel in rows:
        text = str(text)
        # The first label that is set wins. ``label`` is converted only when it
        # is needed: a NULL label used to raise even if an override was present.
        if handlabel is not None:
            class_index = int(handlabel)
        elif relabel is not None:
            class_index = int(relabel)
        else:
            class_index = int(label)
        item_y = [0,0]
        item_y[class_index] = 1
        # the random draw is made before encoding, as before
        is_train = np.random.random()<percent
        item_x = encodeInput([text], word_len=50, word_flag=True,userFool=False)[0]
        if is_train:
            train_x.append(item_x)
            train_y.append(item_y)
        else:
            test_x.append(item_x)
            test_y.append(item_y)
            test_text.append([filename,text])
    return np.array(train_x),np.array(train_y),np.array(test_x),np.array(test_y),test_text
def getTrainData_jsonTable(begin,end,return_text=False):
    '''
    Build (or load from the cache file) the cell level data set of the labelled
    json tables ``[begin:end]`` of ``websource_67000_table.csv``.

    Every cell becomes one sample made of the encoded texts of its 3x3
    neighbourhood; the class is the label stored with the cell.

    @param begin: first row of the csv to use (slice start)
    @param end: row of the csv to stop at (slice end)
    @param return_text: also collect the raw 3x3 neighbourhood of every sample
    @return: (data_x, data_y, data_text); data_x and data_y are numpy arrays,
             data_text is empty unless ``return_text`` is set
    '''
    # imported before the helpers are defined; it used to be imported in the
    # else branch below, after the closure that uses it
    import json

    def encode_table(inner_table,size=30):
        '''Return (data_x, data_y, data_text, data_position) for the cells of one table.'''
        def encode_item(_table,i,j):
            # the 3x3 neighbourhood of column i, row j of the bordered table, row by row
            _x = [_table[j-1][i-1],_table[j-1][i],_table[j-1][i+1],
                  _table[j][i-1],_table[j][i],_table[j][i+1],
                  _table[j+1][i-1],_table[j+1][i],_table[j+1][i+1]]
            e_x = [encodeInput_form(_temp[0],MAX_LEN=30) for _temp in _x]
            _label = _table[j][i][1]
            return e_x,_label,_x

        def copytable(inner_table):
            # copy of the table with every text cut to ``size`` characters
            return [[[item[0][:size],item[1]] for item in line] for line in inner_table]

        table = copytable(inner_table)
        padding = ["#"*30,0]
        width = len(table[0])
        height = len(table)
        # One border cell on every side, so that every real cell has 8 neighbours.
        # The two border rows are added first and widened by the loop below.
        table.insert(0,[padding for i in range(width)])
        table.append([padding for i in range(width)])
        for item in table:
            item.insert(0,padding.copy())
            item.append(padding.copy())
        data_x = []
        data_y = []
        data_text = []
        data_position = []
        for _i in range(1,width+1):
            for _j in range(1,height+1):
                _x,_y,_text = encode_item(table,_i,_j)
                data_x.append(_x)
                # NOTE(review): assumes the stored label is 0 or 1 - any other
                # value does not fit the two-class one-hot; confirm against the data
                _label = [0,0]
                _label[_y] = 1
                data_y.append(_label)
                data_text.append(_text)
                data_position.append([_i-1,_j-1])
        return data_x,data_y,data_text,data_position

    def getDataSet(list_json_table,return_text=False):
        '''Encode every json table of the sequence and stack the samples.'''
        _sum = len(list_json_table)
        data_x = []
        data_y = []
        data_text = []
        for _count,json_table in enumerate(list_json_table,1):
            print("%d/%d"%(_count,_sum))
            table = json.loads(json_table)
            # null tables are skipped; empty tables too, they would fail on table[0]
            if not table:
                continue
            # Fixed: encode_table returns four values, unpacking three raised
            # ValueError; the cell positions are not needed here.
            list_x,list_y,list_text,_ = encode_table(table)
            data_x.extend(list_x)
            data_y.extend(list_y)
            if return_text:
                data_text.extend(list_text)
        return np.array(data_x),np.array(data_y),data_text

    save_path = "./traindata/websource_67000_table_%d-%d-%s.pk"%(begin,end,"1" if return_text else "0")
    if os.path.exists(save_path):
        data_x,data_y,data_text = load(save_path)
    else:
        df = pd.read_csv("../../dl_dev/form/traindata/websource_67000_table.csv", encoding="GBK")
        data_x,data_y,data_text = getDataSet(df["json_table"][begin:end],return_text=return_text)
        save((data_x,data_y,data_text),save_path)
    return data_x,data_y,data_text
if __name__=="__main__":
    # Earlier steps of the labelling workflow, run by hand when needed:
    #getSourceData()
    #importData()
    #selectWithRule("data_item.xls", ["批量.xls"], "temp.xls")
    #importRelabel()
    # getHtml()
    import sys

    # getTrainData_jsonTable needs the row range; it was called without
    # arguments, which always raised TypeError. The range is read from the
    # command line instead of being guessed here.
    if len(sys.argv)!=3:
        sys.exit("usage: %s BEGIN END"%sys.argv[0])
    getTrainData_jsonTable(int(sys.argv[1]),int(sys.argv[2]))
-
|