12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853 |
- import re
- from bs4 import BeautifulSoup, Comment
- import copy
- import sys
- import os
- import time
- sys.path.append(os.path.abspath("../.."))
- import fool
- from BiddingKG.dl.interface.Connection import *
- from BiddingKG.dl.common.Utils import *
- import BiddingKG.dl.interface.settings as settings
- from BiddingKG.dl.interface.Connection import getConnection
- from BiddingKG.dl.interface.Entitys import *
- from BiddingKG.dl.form.feature import encoding
- from BiddingKG.dl.interface.predictor import *
- formPredictor = FormPredictor()
def tableToText(soup):
    '''
    Flatten every HTML table in the document into sentence-like text.

    Each <table> node and then each <tbody> node is parsed into a grid of
    [text, flag] cells, colspan/rowspan are expanded, header cells are marked
    with the module-level formPredictor, and the node's content is replaced by
    the generated text.

    @param:
        soup: BeautifulSoup tree of the page html (modified in place)
    @return: (soup, tables) - the processed soup and the list of intermediate
        cell grids, one per processed node (temporary, kept for inspection)
    '''

    def getTrs(tbody):
        # Collect all <tr> nodes: direct children, plus the direct <tr>
        # children of any directly nested <tbody>.
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name=="tr":
                trs.append(obj)
            if obj.name=="tbody":
                for tr in obj.find_all("tr",recursive=False):
                    trs.append(tr)
        return trs

    def getTable(tbody):
        # Build the cell grid plus the per-cell colspan / rowspan values.
        # Every cell is a two-item list [text, flag]; flag starts at 0 and is
        # later set to 1 (row header) or 2 (column header) by setHead.
        #trs = tbody.findChildren('tr', recursive=False)
        def add_punc(soup):
            # Add punctuation after block-level nodes inside a table cell.
            commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
            # Walk all descendant nodes and insert the punctuation marks.
            for child in soup.find_all(recursive=True):
                if child.name == 'br':
                    child.insert_before(',')
                child_text = re.sub('\s', '', child.get_text())
                # Skip empty nodes and nodes that already end with punctuation.
                if child_text == '' or child_text[-1] in ['。',',',':',';']:
                    continue
                if child.name in commaList:
                    if len(child_text)>3 and len(child_text) <50: # fewer than 50 characters: add a comma, otherwise a full stop
                        child.insert_after(",")
                    elif len(child_text) >=50:
                        child.insert_after("。")
            return soup
        trs = getTrs(tbody)
        inner_table = []
        colspan = [] # colspan of every <td> tag
        rowspan = [] # rowspan of every <td> tag
        for tr in trs:
            tr_line = []
            tr_col = []
            tr_row = []
            tds = tr.findChildren(['td','th'], recursive=False)
            for td in tds:
                # Only punctuate cells that contain more than two block-level nodes.
                if len(td.find_all(['p','div','br','dl','ul'])) > 2:
                    add_punc(td)
                if 'colspan' in td.attrs and td['colspan'].isdigit():
                    tr_col.append(int(td['colspan']))
                else:
                    tr_col.append(1)
                if 'rowspan' in td.attrs and td['rowspan'].isdigit():
                    tr_row.append(int(td['rowspan']))
                else:
                    tr_row.append(1)
                tr_line.append([re.sub('\s*','',td.get_text()),0])
            # A <tr> without any <td>/<th>: treat the whole row as a single cell.
            if tr_row == []:
                tr_row.append(1)
                tr_col.append(1)
                tr_line.append([re.sub('\s*','',tr.get_text()),0])
            inner_table.append(tr_line)
            colspan.append(tr_col)
            rowspan.append(tr_row)
        return inner_table, colspan, rowspan
    def fix_rowspan(inner_table, colspan, rowspan):
        # Approach 2: for every row i, expand a colspan when it is < 3 or not
        # larger than the number of cells in the row.
        # Then for every row i with rowspan > 1, first check that the column
        # totals would match after filling, and only then copy the cell down.
        def is_same_item(list):
            # True when every element of the list equals the first one.
            flag = True
            item0 = list[0]
            for i in range(1,len(list)):
                if item0 != list[i]:
                    flag = False
                    break
            return flag

        for i in range(len(inner_table)):
            # Number of cells already inserted into this row; added to w so the
            # index keeps pointing at the next original cell.
            curent_row_total_colspan = 0
            if len(colspan[i]) >= 2:
                for w in range(len(colspan[i])):
                    if colspan[i][w+curent_row_total_colspan] > 1 and (colspan[i][w+curent_row_total_colspan] < 3 or colspan[i][w+curent_row_total_colspan] <= len(colspan[i])):
                    #if colspan[i][w+curent_row_total_colspan] > 1 and colspan[i][w+curent_row_total_colspan] < len(colspan[i]): # extra condition: otherwise a two-column row needing 2 filled columns fails
                        for num in range(1, colspan[i][w+curent_row_total_colspan]):
                            colspan[i].insert(w+num+curent_row_total_colspan, 1)
                            rowspan[i].insert(w+num+curent_row_total_colspan, rowspan[i][w+curent_row_total_colspan])
                            inner_table[i].insert(w+num+curent_row_total_colspan, copy.deepcopy(inner_table[i][w+curent_row_total_colspan]))
                        colspan[i][w+curent_row_total_colspan] = 1
                        curent_row_total_colspan += num
        for i in range(len(inner_table) -1): # a rowspan is compared with the next row, hence the minus 1
            if is_same_item(rowspan[i]):
                continue
            for j in range(len(rowspan[i])):
                if rowspan[i][j] > 1 and rowspan[i][j]<= len(inner_table) :
                    if sum(colspan[i]) == sum(colspan[i+1]) + colspan[i][j] or sum(colspan[i]) == sum(colspan[i+1]) + len([span for span in rowspan[i] if span > 1]):
                        # Copy the spanning cell into the next row with one row less to span.
                        rowspan[i+1].insert(j, rowspan[i][j]-1)
                        colspan[i+1].insert(j, 1)
                        inner_table[i+1].insert(j, copy.deepcopy(inner_table[i][j]))
                        rowspan[i][j] = 1
        return inner_table, colspan, rowspan

    # Mark the header cells
    def setHead(inner_table, prob_min=0.5):
        # Idea: first split the rows into sections of equal column count, then
        # look for row headers inside each section; once all row headers are found,
        # look for column headers between consecutive row headers and between
        # the last row header and the end of its section.
        def del_continuous_value(l): # drop consecutive duplicates before header prediction
            new_list = []
            new_list.append(l[0])
            for item in l:
                if item != new_list[-1]:
                    new_list.append(item)
            return new_list

        def find_diflen(l):
            # Split into runs of equal column count; the result is a flat list
            # of [begin, end, begin, end, ...] row indexes.
            area = [0]
            temp = l[0]
            for i in range(len(l)):
                if temp != l[i]:
                    area.append(i-1)
                    area.append(i)
                    temp = l[i]
            area.append(len(l)-1)
            return area

        l = [len(tr) for tr in inner_table] # number of columns of every row
        diff = find_diflen(l) # sections of equal column count
        height = len(inner_table)
        rowHeader = []
        # NOTE(review): head_list is filled below but never returned; the
        # function returns diff in its place.
        head_list = []
        area_end_index = []
        for index in range(0,len(diff),2):
            area_begin = diff[index]
            area_end = diff[index+1]
            head_list.append(area_begin)
            # row headers
            has_row_head = False
            for i in range(area_begin, area_end+1): # look for row headers inside the section
                if i == area_end: # the last row of a section is never a header
                    continue
                if [item[0] for item in inner_table[i] if len(item[0]) > 20] != []: # any cell longer than 20 characters: not a header row
                    continue
                width = len(inner_table[i])
                is_row_head = False
                #item_set = set([item[0] for item in inner_table[i] if item[0] != ''])
                item_set = [item[0] for item in inner_table[i]]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_row_head = True
                    else:
                        is_row_head = False
                #if fool.ner('|'.join(item_set)) != [[]]:
                    #is_row_head = False
                if is_row_head:
                    head_list.append(i)
                    rowHeader.append(i)
                    has_row_head = True
                    #print('before setting 1 ',inner_table[i])
                    for j in range(width):
                        inner_table[i][j][1] = 1
                    #print(inner_table[i])
            head_list.append(area_end)
            if has_row_head:
                area_end_index.append(area_end)
                rowHeader.append(area_end)
        for index in range(len(rowHeader)-1): # look for column headers between row headers
            if rowHeader[index] in area_end_index:
                continue
            width = len(inner_table[rowHeader[index]])
            for i in range(width-1): # the last column is never a column header
                is_head = False
                #predict is head or not with model
                temp_item = []
                for j in range(rowHeader[index],rowHeader[index+1]+1): # start at the row header itself
                    temp_item.append(inner_table[j][i][0])
                #item_set = set([item for item in temp_item if item != ''])
                item_set = [item for item in temp_item]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_head = True
                    else:
                        is_head = False
                if is_head:
                    for j in range(rowHeader[index]+1,rowHeader[index+1]+1): # mark from the row below the row header
                        inner_table[j][i][1] = 2
        return inner_table,diff,area_end_index
    # Decide the reading direction of a table section
    def getDirect(inner_table,begin,end):
        column_head = set()
        row_head = set()
        widths = len(inner_table[begin])
        for height in range(begin,end):
            for width in range(widths):
                if inner_table[height][width][1] ==1:
                    row_head.add(height)
                if inner_table[height][width][1] ==2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        #if 0 in column_head and begin not in row_head:
        if widths == 1 and begin != end:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin,end):
                count = 0
                count_flag = True
                # NOTE(review): range(width) uses the loop variable left over
                # from the scan above (widths-1), so the last column is not
                # checked here - confirm whether range(widths) was intended.
                for width_index in range(width):
                    if inner_table[height][width_index][1]==0:
                        if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count>=2:
                    return "column"
        return "row"

    # Generate sentences following the reading direction of each section
    def getTableText(inner_table,head_list,area_end_index):
        # Idea: inside each section, search upwards/leftwards from every value
        # cell for its headers and prefix them; text whose header matches a
        # keyword pattern is placed first.
        rankPattern = "(排名|排序|名次|评标结果|评审结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        height = len(inner_table)
        text = ""

        # head_list is consumed in (begin, end) pairs.
        for index in range(0,len(head_list),2):
            head_begin = head_list[index]
            head_end = head_list[index+1]
            direct = getDirect(inner_table, head_begin, head_end)
            if direct=="row":
                # row headers: emit one sentence per row
                has_row_head = False
                for i in range(head_begin,head_end+1):
                    width = len(inner_table[i])
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates within the same sentence are dropped
                    text_set = set()
                    for j in range(width):
                        cell = inner_table[i][j]
                        # the cell is a value
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            # look leftwards for column headers (flag 2)
                            for loop_j in range(1,j+1):
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                for loop_i in range(i-1, head_begin-1, -1): # changed: search upwards down to the section start
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop at the first value cell after a header was found
                                        if find_flag:
                                            break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:

                                rank_text += head+inner_table[i][j][0]+","
                                #print(rank_text)
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                                #print(entity_text)
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    #print(re.sub('\s', '', rank_text+entity_text+text_line))
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。" # changed: short sentences end with a comma
                    #print(text)
            else:
                # column headers: emit one sentence per column
                has_row_head = False
                width = len(inner_table[head_begin])
                for j in range(width):
                #for i in range(head_begin,head_end+1):
                    #width = len(inner_table[i])
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates within the same sentence are dropped
                    text_set = set()
                    for i in range(head_begin,head_end+1):
                    #for j in range(width):
                        cell = inner_table[i][j]
                        # the cell is a value
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                for loop_i in range(i-1, head_begin-1, -1): # changed: search upwards down to the section start
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop at the first value cell after a header was found
                                        if find_flag:
                                            break
                            find_flag = False
                            temp_head = ""
                            for loop_j in range(1,j+1): # look for column headers
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break

                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:

                                rank_text += head+inner_table[i][j][0]+","
                                #print(rank_text)
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                                #print(entity_text)
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    #print(re.sub('\s', '', rank_text+entity_text+text_line))
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。" # changed: short sentences end with a comma

        return text
    def trunTable(tbody):
        # Convert one table node in place: parse it, expand spans, mark headers,
        # replace its content with the generated text and rename it to "table".
        inner_table, colspan, rowspan = getTable(tbody)
        inner_table, colspan, rowspan = fix_rowspan(inner_table, colspan, rowspan)
        if len(inner_table)>0 and len(inner_table[0])>0:
            inner_table,head_list,area_end_index = setHead(inner_table)
            tbody.string = getTableText(inner_table,head_list,area_end_index)
            #print(tbody.string)
            tbody.name = "table"
        return inner_table # temporary change: keep the intermediate grid

    # NOTE(review): pat_head and pat_value are compiled here but not referenced
    # anywhere else in this function.
    pat_head = re.compile('(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造)')
    #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")

    tbodies = soup.find_all('table')
    # Walk every <table> node

    tables = [] # temporary addition

    # Process in reverse order so nested tables are handled first
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody) # temporary change: keep the intermediate grid
        tables.append(inner_table)

    tbodies = soup.find_all('tbody')
    # Walk every <tbody> node
    # Process in reverse order so nested tables are handled first
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody) # temporary change: keep the intermediate grid
        tables.append(inner_table)
    #return soup
    return soup, tables # temporary change
- #数据清洗
def segment(soup):
    '''
    Clean a parsed page into one punctuated text string.

    Punctuation is inserted after block-level nodes of the soup (in place),
    the text is extracted, whitespace-separated pieces are re-joined with
    context dependent separators, and runs of punctuation are collapsed.

    @param:
        soup: parsed page; only find_all() and get_text() are called on it,
            plus name/get_text()/insert_before()/insert_after() on its nodes
    @return: the cleaned text
    '''
    segList = ["title"]
    commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
    # Walk all descendant nodes and insert the punctuation marks.
    for child in soup.find_all(recursive=True):
        if child.name == 'br':
            child.insert_before(',')
        child_text = re.sub(r'\s', '', child.get_text())
        # Skip empty nodes and nodes that already end with punctuation.
        if child_text == '' or child_text[-1] in ['。',',',':',';']:
            continue
        if child.name in segList:
            child.insert_after("。")
        if child.name in commaList:
            # Fewer than 50 characters: add a comma, otherwise a full stop.
            if 3 < len(child_text) < 50:
                child.insert_after(",")
            elif len(child_text) >= 50:
                child.insert_after("。")
    text = str(soup.get_text())

    # Replace the ASCII double quote, otherwise the deepdive import fails.
    text = text.replace('"',"“")

    # Split on whitespace and decide piece by piece what replaces the gap.
    text_list = re.sub(r"\s+","#nbsp#",text).split('#nbsp#')
    joined = []
    for i in range(len(text_list)-1):
        current = text_list[i]
        following = text_list[i+1]
        if current == '' or current[-1] in [',','。',';',':']:
            joined.append(current)
        elif re.findall('([一二三四五六七八九]、)', following[:4]) != []:
            joined.append(current + '。')
        elif re.findall('([0-9]、)', following[:4]) != []:
            joined.append(current + ';')
        elif current.isdigit() and following.isdigit():
            joined.append(current + ' ')
        # following[:1] instead of following[0]: text that ends in whitespace
        # leaves an empty last piece, which used to raise IndexError here.
        elif current[-1] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','¥'] or following[:1] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','元','万元']:
            joined.append(current)
        elif len(current) >= 3 and len(following) >= 3:
            joined.append(current + ',')
        else:
            joined.append(current)
    joined.append(text_list[-1])
    text = ''.join(joined)
    # Replace colon / comma / semicolon next to a Chinese character.
    # NOTE(review): in this copy of the file each pattern and its replacement
    # are the same ASCII character, so these three substitutions change
    # nothing; the original comments say "replace with the Chinese form", so
    # the replacement was presumably the full-width character - confirm
    # against the upstream file encoding.
    text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
    text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
    text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)

    # Collapse punctuation until nothing changes any more.
    while True:
        # a comma followed by another mark: keep the other mark
        punc = re.search(r",(?P<punc>:|。|,|;)\s*",text)
        if punc is not None:
            text = re.sub(","+punc.group("punc")+r"\s*",punc.group("punc"),text)

        # a mark followed by a comma: keep the mark
        punc = re.search(r"(?P<punc>:|。|,|;)\s*,",text)
        if punc is not None:
            text = re.sub(punc.group("punc")+r"\s*,",punc.group("punc"),text)
        else:
            # remove whitespace that follows a mark
            punc = re.search(r"(?P<punc>:|。|,|;)\s+",text)
            if punc is not None:
                text = re.sub(punc.group("punc")+r"\s+",punc.group("punc"),text)
            else:
                break
    # Collapse consecutive Chinese full stops into one.
    text = "。".join(part for part in text.split("。") if len(part)>0)

    return text
def union_ner(list_ner):
    '''
    Merge neighbouring "org" / "company" entities into one "company" entity.

    @param:
        list_ner: sequence of (begin, end, type, text) entity tuples
    @return: the entities that took part in no merge, in their original order,
        followed by one merged tuple per merged pair
    '''
    wanted_types = {"org", "company"}
    merged_pairs = []
    merged_positions = set()
    for left, (first, second) in enumerate(zip(list_ner, list_ner[1:])):
        # One entity of each type is required ...
        if {str(first[2]), str(second[2])} != wanted_types:
            continue
        # ... and the end of the first must exceed the begin of the second by 1.
        if first[1] - second[0] != 1:
            continue
        merged_positions.update((left, left + 1))
        merged_pairs.append((left, left + 1))

    untouched = [ner for pos, ner in enumerate(list_ner) if pos not in merged_positions]
    fused = [
        (list_ner[a][0], list_ner[b][1], 'company', str(list_ner[a][3]) + str(list_ner[b][3]))
        for a, b in merged_pairs
    ]
    return untouched + fused
-
-
def getTokensAndNers(sentences,MAXAREA = 100000):
    '''
    Tokenise and NER-tag the sentences in size-limited batches.

    Sentences are sorted by length (longest first) and sent to fool.cut /
    fool.ner in batches whose size is MAXAREA // (length of the longest
    sentence in the batch) + 1, so the padded batch stays bounded.

    @param:
        sentences: list of sentence strings
        MAXAREA: upper bound used to size each batch
    @return: (tokens, ners) - two lists aligned with the input order; both are
        empty when sentences is empty
    '''

    def getData(tokens,ners,process_data):
        # Run one batch and write the results back at the original positions.
        process_sentences = [item[1] for item in process_data]
        token_ = fool.cut(process_sentences)
        ner_ = fool.ner(process_sentences)
        for i, token_item in enumerate(token_):
            the_index = process_data[i][0]
            tokens[the_index] = token_item
            ners[the_index] = ner_[i]

    # (original index, sentence) pairs, longest sentence first
    sents = sorted(enumerate(sentences), key=lambda x: len(x[1]), reverse=True)
    tokens = [[] for _ in sentences]
    ners = [[] for _ in sentences]

    index_ = 0
    # Checking the bound first makes an empty input return immediately instead
    # of indexing sents[0].
    while index_ < len(sents):
        # An empty sentence would otherwise divide by zero.
        width = max(len(sents[index_][1]), 1)
        # Always advance by at least one sentence, never past the end.
        height = max(1, min(MAXAREA // width + 1, len(sents) - index_))
        getData(tokens, ners, sents[index_:index_ + height])
        index_ += height
    return tokens,ners
-
def get_articles_processed(articles):
    '''
    @summary: preprocessing step - table flattening, text cleaning, tokenising
        and entity recognition (NER entities plus regex-detected money amounts)
    @param:
        articles: list of articles to process; article[0] is used as the
            document id and article[1] as the html content
    @return: list of articles, list of each article of sentences, list of each article of entitys
    '''
    list_articles = []
    list_sentences = []
    list_entitys = []
    for article in articles:
        list_sentences_temp = []
        list_entitys_temp = []
        doc_id = article[0]
        # table processing
        #article_processed = segment(tableToText(BeautifulSoup(article[1],"lxml")))

        soup, tables = tableToText(BeautifulSoup(article[1],"lxml"))
        article_processed = segment(soup)

        #list_articles.append([doc_id,article_processed, article[1],tables]) # temporary change: keep results before and after processing for comparison
        #return list_articles # temporary change: keep results before and after processing for comparison

        list_articles.append(Article(doc_id,article_processed))
        # nlp processing
        if article_processed is not None and len(article_processed)!=0:
            split_patten = "。"
            sentences = re.split(split_patten,article_processed)
            sentences = [x for x in sentences if len(x)!=0]

            # NOTE(review): the four lists below are not used later in this function.
            lemmas = []
            doc_offsets = []
            dep_types = []
            dep_tokens = []

            time1 = time.time()

            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # run in rate-limited batches
            tokens_all,ner_entitys_all = getTokensAndNers(sentences)

            print("nlp:",time.time()-time1)

            for sentence_index in range(len(sentences)):



                list_sentence_entitys = []
                sentence_text = sentences[sentence_index]
                tokens = tokens_all[sentence_index]

                # Character offset at which every token starts, followed by
                # one extra entry (total length + 1) that closes the last token.
                list_tokenbegin = []
                begin = 0
                for i in range(0,len(tokens)):
                    list_tokenbegin.append(begin)
                    begin += len(str(tokens[i]))
                list_tokenbegin.append(begin+1)
                #pos_tag = pos_all[sentence_index]
                pos_tag = ""

                ner_entitys = ner_entitys_all[sentence_index]

                list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))

                # recognise entities
                for ner_entity in ner_entitys:
                    begin_index_temp = ner_entity[0]
                    entity_type = ner_entity[2]
                    entity_text = ner_entity[3]

                    # Map the character offsets of the entity to token indexes.
                    # NOTE(review): begin_index / end_index are only assigned
                    # when the scans below hit a break; otherwise a value from
                    # an earlier iteration (or an unbound name) is used.
                    for j in range(len(list_tokenbegin)):
                        if list_tokenbegin[j]==begin_index_temp:
                            begin_index = j
                            break
                        elif list_tokenbegin[j]>begin_index_temp:
                            begin_index = j-1
                            break
                    begin_index_temp += len(str(entity_text))
                    for j in range(begin_index,len(list_tokenbegin)):
                        if list_tokenbegin[j]>=begin_index_temp:
                            end_index = j-1
                            break
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)

                    list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))


                # recognise money amounts with regular expressions
                entity_type = "money"

                #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"

                # Every pattern has the same five groups: 0 whole match,
                # 1 unit before the number, 2 text between keyword and number,
                # 3 the number, 4 unit after the number.
                list_money_pattern = {"cn":"(()()([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})())*",
                                      "key_word":"((?:[¥¥]+,?|报价|标价)(?:[(\(]?\s*([万元]*)\s*[)\)]?)\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "front_m":"((?:[(\(]?\s*([万元]+)\s*[)\)])\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "behind_m":"(()()([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?([万元]+)[\))]?)*"}

                set_begin = set()
                for pattern_key in list_money_pattern.keys():
                    pattern = re.compile(list_money_pattern[pattern_key])
                    all_match = re.findall(pattern, sentence_text)
                    # Running character offset in the sentence: every pattern
                    # ends in "*", so findall yields an (empty) match for each
                    # unmatched position and index advances by 1 for those.
                    index = 0
                    for i in range(len(all_match)):
                        if len(all_match[i][0])>0:
                            #print(all_match[i][0])
                            unit = ""
                            entity_text = all_match[i][3]
                            if pattern_key in ["key_word","front_m"]:
                                unit = all_match[i][1]
                            else:
                                unit = all_match[i][4]
                            if entity_text.find("元")>=0:
                                unit = ""

                            index += len(all_match[i][0])-len(entity_text)-len(all_match[i][4])#-len(all_match[i][1])-len(all_match[i][2])# whole match as entity -> only the numeric part as entity, otherwise features are lost

                            for j in range(len(list_tokenbegin)):
                                if list_tokenbegin[j]==index:
                                    begin_index = j
                                    break
                                elif list_tokenbegin[j]>index:
                                    begin_index = j-1
                                    break
                            index += len(str(entity_text))+len(all_match[i][4])#+len(all_match[i][2])+len(all_match[i][1])# whole match as entity
                            #index += len(str(all_match[i][0]))
                            for j in range(begin_index,len(list_tokenbegin)):
                                if list_tokenbegin[j]>=index:
                                    end_index = j-1
                                    break
                            entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)


                            # Keep digits, the dot and the Chinese numeral / unit characters only.
                            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]","",entity_text)
                            if len(unit)>0:
                                entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
                            else:
                                entity_text = str(getUnifyMoney(entity_text))


                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))

                        else:
                            index += 1

                list_sentence_entitys.sort(key=lambda x:x.begin_index)
                list_entitys_temp = list_entitys_temp+list_sentence_entitys
        list_sentences.append(list_sentences_temp)
        list_entitys.append(list_entitys_temp)
    return list_articles,list_sentences,list_entitys
-
def union_result(codeName,prem):
    '''
    @summary: combine the results of the two models into one dict per document
    @param:
        codeName: list of [doc_id, dict] results of the code/name model
        prem: list of [doc_id, dict] results holding the roles with attributes
    @return: list of [doc_id, merged dict]; a pair whose ids differ is dropped,
        and on equal keys the value from prem wins
    '''
    assert len(codeName)==len(prem)
    return [
        [code_entry[0], dict(code_entry[1], **prem_entry[1])]
        for code_entry, prem_entry in zip(codeName, prem)
        if code_entry[0] == prem_entry[0]
    ]
def persistenceData(data):
    '''
    @summary: save intermediate results to the database - not needed when
        running in online production
    @param:
        data: list of [doc_id, dic] items where dic holds 'code' (list of
            strings), 'name' (string) and 'prem' (list of items, each an
            iterable of scalars and lists of string sequences)
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    # Values are passed as query parameters instead of being concatenated into
    # the statement, so quotes inside the extracted text cannot break or alter
    # the SQL.
    sql = "insert into predict_validation(doc_id,code,name,prem) values(%s,%s,%s,%s)"
    try:
        cursor = conn.cursor()
        for record in data:
            doc_id = record[0]
            dic = record[1]
            code_insert = ";".join(dic['code'])
            name = dic['name']
            # Serialise prem: fields end with "$", the members of a list field
            # end with ",", every prem item ends with ";".
            parts = []
            for prem_item in dic['prem']:
                for x in prem_item:
                    if isinstance(x, list):
                        if len(x)>0:
                            for x1 in x:
                                parts.append("/".join(x1)+",")
                            parts.append("$")
                    else:
                        parts.append(str(x)+"$")
                parts.append(";")
            cursor.execute(sql, (doc_id, code_insert, name, "".join(parts)))
        conn.commit()
    finally:
        # Release the connection even when an insert fails.
        conn.close()
-
def persistenceData1(list_entitys,list_sentences):
    '''
    @summary: save intermediate results to the database - not needed when
        running in online production
    @param:
        list_entitys: list (per article) of lists of entity objects; the
            attributes entity_id, entity_text, entity_type, doc_id,
            sentence_index, begin_index, end_index, label and values are read
        list_sentences: list (per article) of lists of sentence objects; the
            attributes doc_id, sentence_index and tokens are read
    '''
    import psycopg2

    def to_native(value):
        # psycopg2 adapts builtin types only. Objects exposing tolist() are
        # converted first - presumably label/values can be numpy scalars or
        # arrays coming from a model; verify against the callers.
        if isinstance(value, (list, tuple)):
            return [to_native(item) for item in value]
        return value.tolist() if hasattr(value, "tolist") else value

    # Values are passed as query parameters instead of being concatenated into
    # the statements, so quotes in entity text or tokens cannot break or alter
    # the SQL; python lists are sent as SQL arrays.
    sql_labelled = (" insert into predict_entity(entity_id,entity_text,entity_type,doc_id,"
                    "sentence_index,begin_index,end_index,label,values)"
                    " values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    sql_plain = (" insert into predict_entity(entity_id,entity_text,entity_type,doc_id,"
                 "sentence_index,begin_index,end_index)"
                 " values(%s,%s,%s,%s,%s,%s,%s)")
    sql_sentence = " insert into predict_sentences(doc_id,sentence_index,tokens) values(%s,%s,%s)"

    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        for list_entity in list_entitys:
            for entity in list_entity:
                params = [str(entity.entity_id), str(entity.entity_text), str(entity.entity_type),
                          str(entity.doc_id), to_native(entity.sentence_index),
                          to_native(entity.begin_index), to_native(entity.end_index)]
                if entity.values is not None:
                    params.append(to_native(entity.label))
                    params.append(to_native(list(entity.values)))
                    cursor.execute(sql_labelled, params)
                else:
                    cursor.execute(sql_plain, params)
        for list_sentence in list_sentences:
            for sentence in list_sentence:
                # An empty token list is sent as an empty array; the hand-built
                # literal used to produce the invalid text "array]" for it.
                cursor.execute(sql_sentence, (sentence.doc_id, to_native(sentence.sentence_index), list(sentence.tokens)))
        conn.commit()
    finally:
        # Release the connection even when an insert fails.
        conn.close()
# Manual driver: read local html files, run the preprocessing on them and
# pickle the result.
if __name__=="__main__":
    import glob
    import re
    #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190320/*.html')
    #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190306/*.html')
    #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190513/*.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58466066.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58521609.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58502967.html') # content missing
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58445908.html') # purchaser / unit name recognised as header
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190416要素\\比地_101_61320687.html') # row header not detected
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190306\\比地_52_57131306.html') # spaces inside span cannot be told apart
    #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58522893.html') # a tr row without td
    files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
    #files = glob.glob('F:/工作文档/实体识别实体对其/test/*.html')
    #files = ['F:/工作文档/实体识别实体对其/1.html']
    print(len(files))
    i = 0
    # [path, html content] pairs; the path is used as the document id
    filePaths =[]
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            content = f.read()
            filePaths.append([file, content])
        #tables = re.findall('<table[^<].*?</table>', re.sub('\s','',content))
        ##if len(tables) == 0 and re.search('采购人', content) != None and re.search('代理机构',content) != None and re.search(' ',content) != None:
            ##filePaths.append([file, content])
        #for table in tables:
            #if re.search('排序', table) != None or re.search('名次',table) != None\
                                                   #or re.search('排名', table) != None:
            ##if re.search('colspan', table) != None and re.search('rowspan',table) != None and re.search('第一中标', table) != None:
                #filePaths.append([file, content])
                #break
    # NOTE(review): get_articles_processed returns a 3-tuple
    # (articles, sentences, entities), so list_articles is that tuple here and
    # the final print shows its length, not the number of articles.
    list_articles = get_articles_processed(filePaths)
    #list_articles,list_sentences,list_entitys = get_articles_processed(filePaths)
    # NOTE(review): pickle is not imported explicitly in this file; presumably
    # it is provided by one of the star imports - verify.
    with open('F:/工作文档/实体识别实体对其/20190416要素/list_articles_20190306.pkl', 'wb') as f:
        pickle.dump(list_articles, f)
    print(len(list_articles))
-
-
- #doc_id = "09067598-7076-11e8-9dae-52540087e52f"
- #import psycopg2
- #conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
- #cursor = conn.cursor()
- #sql = " select id,content from articles where id='"+doc_id+"' "
- #cursor.execute(sql)
- #ContentIDs = cursor.fetchall()
- #list_articles,list_sentences,list_entitys = get_articles_processed(ContentIDs)
- #for i in range(len(list_entitys)):
- #for j in range(len(list_entitys[i])):
- #entity = list_entitys[i][j]
- #sentence = list_sentences[i][entity.sentence_index]
- #tokens = sentence.tokens
- #begin_index = entity.begin_index
- #end_index = entity.end_index
- #if entity.entity_type in ['org','company']:
- #item_x = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1])
- ##print(item_x)
-
|