Preprocessing1.py 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853
  1. import re
  2. from bs4 import BeautifulSoup, Comment
  3. import copy
  4. import sys
  5. import os
  6. import time
  7. sys.path.append(os.path.abspath("../.."))
  8. import fool
  9. from BiddingKG.dl.interface.Connection import *
  10. from BiddingKG.dl.common.Utils import *
  11. import BiddingKG.dl.interface.settings as settings
  12. from BiddingKG.dl.interface.Connection import getConnection
  13. from BiddingKG.dl.interface.Entitys import *
  14. from BiddingKG.dl.form.feature import encoding
  15. from BiddingKG.dl.interface.predictor import *
# Module-level singleton: table-header classifier (loads its model once at
# import time); used by tableToText/setHead to score candidate header rows.
formPredictor = FormPredictor()
def tableToText(soup):
    '''
    Convert every <table>/<tbody> in the page into linear text sentences.

    @param:
        soup: BeautifulSoup tree of the page html
    @return: (soup, tables) - the soup with each table's content replaced by
        generated text, plus the list of intermediate cell matrices
        (temporarily returned for debugging, per the "临时修改" notes)
    '''
    # NOTE(review): this body was re-indented from a whitespace-mangled paste;
    # nesting follows the apparent control flow - confirm against VCS history.
    def getTrs(tbody):
        # Collect all <tr> rows, including rows wrapped in a nested <tbody>.
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name=="tr":
                trs.append(obj)
            if obj.name=="tbody":
                for tr in obj.find_all("tr",recursive=False):
                    trs.append(tr)
        return trs

    def getTable(tbody):
        # Build a cell matrix plus per-cell colspan/rowspan counts from a table.
        #trs = tbody.findChildren('tr', recursive=False)
        def add_punc(soup):
            # Insert punctuation between block-level nodes inside a cell so the
            # flattened text keeps sentence boundaries.
            commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
            # Walk every descendant node and insert separators.
            for child in soup.find_all(recursive=True):
                if child.name == 'br':
                    child.insert_before(',')
                child_text = re.sub('\s', '', child.get_text())
                if child_text == '' or child_text[-1] in ['。',',',':',';']:
                    continue
                if child.name in commaList:
                    # Short blocks (<50 chars) get a comma, longer ones a full stop.
                    if len(child_text)>3 and len(child_text) <50:
                        child.insert_after(",")
                    elif len(child_text) >=50:
                        child.insert_after("。")
            return soup
        trs = getTrs(tbody)
        inner_table = []   # matrix of [cell_text, head_flag] pairs (head_flag set later)
        colspan = []       # per-row list of each <td>'s colspan
        rowspan = []       # per-row list of each <td>'s rowspan
        for tr in trs:
            tr_line = []
            tr_col = []
            tr_row = []
            tds = tr.findChildren(['td','th'], recursive=False)
            for td in tds:
                # Cells containing several block elements need punctuation inserted.
                if len(td.find_all(['p','div','br','dl','ul'])) > 2:
                    add_punc(td)
                if 'colspan' in td.attrs and td['colspan'].isdigit():
                    tr_col.append(int(td['colspan']))
                else:
                    tr_col.append(1)
                if 'rowspan' in td.attrs and td['rowspan'].isdigit():
                    tr_row.append(int(td['rowspan']))
                else:
                    tr_row.append(1)
                tr_line.append([re.sub('\s*','',td.get_text()),0])
            if tr_row == []:
                # Row without any <td>/<th>: fall back to the whole <tr> text.
                tr_row.append(1)
                tr_col.append(1)
                tr_line.append([re.sub('\s*','',tr.get_text()),0])
            inner_table.append(tr_line)
            colspan.append(tr_col)
            rowspan.append(tr_row)
        return inner_table, colspan, rowspan

    def fix_rowspan(inner_table, colspan, rowspan):
        # Idea 2 (original note): per row i, if a cell has colspan>1 (and either
        # colspan<3 or colspan<=row width) duplicate it to fill the spanned
        # columns; then for rowspan>1 cells, copy the cell down into row i+1
        # only when the resulting column totals would match.
        def is_same_item(list):
            # True when every element of the list equals the first one.
            flag = True
            item0 = list[0]
            for i in range(1,len(list)):
                if item0 != list[i]:
                    flag = False
                    break
            return flag
        for i in range(len(inner_table)):
            curent_row_total_colspan = 0   # how many cells have been inserted so far in this row
            if len(colspan[i]) >= 2:
                for w in range(len(colspan[i])):
                    if colspan[i][w+curent_row_total_colspan] > 1 and (colspan[i][w+curent_row_total_colspan] < 3 or colspan[i][w+curent_row_total_colspan] <= len(colspan[i])):
                        #if colspan[i][w+curent_row_total_colspan] > 1 and colspan[i][w+curent_row_total_colspan] < len(colspan[i]): # extra condition would block filling 2 columns in a 2-column table
                        for num in range(1, colspan[i][w+curent_row_total_colspan]):
                            colspan[i].insert(w+num+curent_row_total_colspan, 1)
                            rowspan[i].insert(w+num+curent_row_total_colspan, rowspan[i][w+curent_row_total_colspan])
                            inner_table[i].insert(w+num+curent_row_total_colspan, copy.deepcopy(inner_table[i][w+curent_row_total_colspan]))
                        colspan[i][w+curent_row_total_colspan] = 1
                        # NOTE(review): relies on `num` keeping its last loop value
                        # (colspan>1 guarantees at least one iteration).
                        curent_row_total_colspan += num
        for i in range(len(inner_table) -1):  # rowspan compares with the next row, hence -1
            if is_same_item(rowspan[i]):
                continue
            for j in range(len(rowspan[i])):
                if rowspan[i][j] > 1 and rowspan[i][j]<= len(inner_table) :
                    # Only copy down when column totals would line up afterwards.
                    if sum(colspan[i]) == sum(colspan[i+1]) + colspan[i][j] or sum(colspan[i]) == sum(colspan[i+1]) + len([span for span in rowspan[i] if span > 1]):
                        rowspan[i+1].insert(j, rowspan[i][j]-1)
                        colspan[i+1].insert(j, 1)
                        inner_table[i+1].insert(j, copy.deepcopy(inner_table[i][j]))
                        rowspan[i][j] = 1
        return inner_table, colspan, rowspan

    # Mark header cells: head_flag 1 = row header, 2 = column header.
    def setHead(inner_table, prob_min=0.5):
        # Idea (original note): first split the table into segments of equal
        # column count, find row headers inside each segment, then look for
        # column headers between consecutive row headers / segment ends.
        def del_continuous_value(l):
            # Collapse consecutive duplicates before scoring as a header.
            new_list = []
            new_list.append(l[0])
            for item in l:
                if item != new_list[-1]:
                    new_list.append(item)
            return new_list
        def find_diflen(l):
            # Return [begin0, end0, begin1, end1, ...] segment bounds where the
            # column count stays constant.
            area = [0]
            temp = l[0]
            for i in range(len(l)):
                if temp != l[i]:
                    area.append(i-1)
                    area.append(i)
                    temp = l[i]
            area.append(len(l)-1)
            return area
        l = [len(tr) for tr in inner_table]  # column count of each row
        diff = find_diflen(l)                # segment boundaries
        height = len(inner_table)
        rowHeader = []
        head_list = []
        area_end_index = []
        for index in range(0,len(diff),2):
            area_begin = diff[index]
            area_end = diff[index+1]
            head_list.append(area_begin)
            # row headers
            has_row_head = False
            for i in range(area_begin, area_end+1):
                if i == area_end:  # last row of a segment is never a header
                    continue
                if [item[0] for item in inner_table[i] if len(item[0]) > 20] != []:
                    # any cell longer than 20 chars disqualifies the row as a header
                    continue
                width = len(inner_table[i])
                is_row_head = False
                #item_set = set([item[0] for item in inner_table[i] if item[0] != ''])
                item_set = [item[0] for item in inner_table[i]]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_row_head = True
                    else:
                        is_row_head = False
                #if fool.ner('|'.join(item_set)) != [[]]:
                #    is_row_head = False
                if is_row_head:
                    head_list.append(i)
                    rowHeader.append(i)
                    has_row_head = True
                    for j in range(width):
                        inner_table[i][j][1] = 1
            head_list.append(area_end)
            if has_row_head:
                # NOTE(review): reconstructed nesting - area_end is recorded both
                # as a search terminator and as a segment end; confirm both
                # appends belong inside this `if`.
                area_end_index.append(area_end)
                rowHeader.append(area_end)
        for index in range(len(rowHeader)-1):  # search column headers between row headers
            if rowHeader[index] in area_end_index:
                continue
            width = len(inner_table[rowHeader[index]])
            for i in range(width-1):  # last column is never a column header
                is_head = False
                # predict is head or not with model
                temp_item = []
                for j in range(rowHeader[index],rowHeader[index+1]+1):
                    temp_item.append(inner_table[j][i][0])
                #item_set = set([item for item in temp_item if item != ''])
                item_set = [item for item in temp_item]
                item_set = del_continuous_value(item_set)
                form_prob = formPredictor.predict(encoding('|'.join(item_set),expand=True),type="line")
                if form_prob is not None:
                    if form_prob[0][1]>prob_min:
                        is_head = True
                    else:
                        is_head = False
                if is_head:
                    for j in range(rowHeader[index]+1,rowHeader[index+1]+1):
                        inner_table[j][i][1] = 2
        return inner_table,diff,area_end_index

    # Decide whether a table segment should be read row-wise or column-wise.
    def getDirect(inner_table,begin,end):
        column_head = set()
        row_head = set()
        widths = len(inner_table[begin])
        for height in range(begin,end):
            for width in range(widths):
                if inner_table[height][width][1] ==1:
                    row_head.add(height)
                if inner_table[height][width][1] ==2:
                    column_head.add(width)
        company_pattern = re.compile("公司")
        #if 0 in column_head and begin not in row_head:
        if widths == 1 and begin != end:
            return "column"
        if 0 in column_head and begin in row_head:
            for height in range(begin,end):
                count = 0
                count_flag = True
                # NOTE(review): `width` here is the leftover value from the loop
                # above (widths-1), so the last column is skipped - looks like a
                # latent bug; confirm whether `widths` was intended.
                for width_index in range(width):
                    if inner_table[height][width_index][1]==0:
                        if re.search(company_pattern,inner_table[height][width_index][0]) is not None:
                            count += 1
                        else:
                            count_flag = False
                if count_flag and count>=2:
                    return "column"
        return "row"

    # Generate sentences following the table's reading direction.
    def getTableText(inner_table,head_list,area_end_index):
        # For each value cell, walk back towards the headers and prefix them;
        # rank-related and entity-related cells are emitted first.
        rankPattern = "(排名|排序|名次|评标结果|评审结果)"
        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
        height = len(inner_table)
        text = ""
        for index in range(0,len(head_list),2):
            head_begin = head_list[index]
            head_end = head_list[index+1]
            direct = getDirect(inner_table, head_begin, head_end)
            if direct=="row":
                # row-wise reading
                has_row_head = False
                for i in range(head_begin,head_end+1):
                    width = len(inner_table[i])
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates inside the same sentence are dropped
                    text_set = set()
                    for j in range(width):
                        cell = inner_table[i][j]
                        # value cell (not a header)
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            # walk left collecting column headers (flag 2)
                            for loop_j in range(1,j+1):
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                # walk upward from the current row to the segment start
                                for loop_i in range(i-1, head_begin-1, -1):
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop once a value cell follows a found header
                                        if find_flag:
                                            break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    # short lines end with a comma, long ones with a full stop
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。"
            else:
                # column-wise reading (same walk with i/j roles swapped)
                has_row_head = False
                width = len(inner_table[head_begin])
                for j in range(width):
                    rank_text = ""
                    entity_text = ""
                    text_line = ""
                    # duplicates inside the same sentence are dropped
                    text_set = set()
                    for i in range(head_begin,head_end+1):
                        cell = inner_table[i][j]
                        # value cell (not a header)
                        if cell[1]==0:
                            head = ""
                            find_flag = False
                            temp_head = ""
                            if i > 0:
                                # walk upward collecting row headers (flag 1)
                                for loop_i in range(i-1, head_begin-1, -1):
                                    if inner_table[loop_i][j][1]==1:
                                        if find_flag:
                                            if inner_table[loop_i][j][0]!=temp_head:
                                                head = inner_table[loop_i][j][0]+":"+head
                                        else:
                                            head = inner_table[loop_i][j][0]+":"+head
                                        find_flag = True
                                        temp_head = inner_table[loop_i][j][0]
                                    else:
                                        # stop once a value cell follows a found header
                                        if find_flag:
                                            break
                            find_flag = False
                            temp_head = ""
                            for loop_j in range(1,j+1):  # walk left for column headers
                                if inner_table[i][j-loop_j][1]==2:
                                    if find_flag:
                                        if inner_table[i][j-loop_j][0]!=temp_head:
                                            head = inner_table[i][j-loop_j][0]+":"+head
                                    else:
                                        head = inner_table[i][j-loop_j][0]+":"+head
                                    find_flag = True
                                    temp_head = inner_table[i][j-loop_j][0]
                                else:
                                    if find_flag:
                                        break
                            if str(head+inner_table[i][j][0]) in text_set:
                                continue
                            if re.search(rankPattern,head) is not None:
                                rank_text += head+inner_table[i][j][0]+","
                            elif re.search(entityPattern,head) is not None:
                                entity_text += head+inner_table[i][j][0]+","
                            else:
                                text_line += head+inner_table[i][j][0]+","
                            text_set.add(str(head+inner_table[i][j][0]))
                    text += rank_text+entity_text+text_line
                    # short lines end with a comma, long ones with a full stop
                    text = text[:-1]+"," if len(re.sub('\s', '', rank_text+entity_text+text_line)) < 20 else text[:-1]+"。"
        return text

    def trunTable(tbody):
        # Full pipeline for one table: matrix -> span fixup -> header tagging ->
        # text generation; the node's content is replaced by the generated text.
        inner_table, colspan, rowspan = getTable(tbody)
        inner_table, colspan, rowspan = fix_rowspan(inner_table, colspan, rowspan)
        if len(inner_table)>0 and len(inner_table[0])>0:
            inner_table,head_list,area_end_index = setHead(inner_table)
            tbody.string = getTableText(inner_table,head_list,area_end_index)
            #print(tbody.string)
            tbody.name = "table"
        return inner_table  # temporary: expose the intermediate matrix

    pat_head = re.compile('(名称|序号|项目|标项|工程|品目[一二三四1234]|第[一二三四1234](标段|名|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理|制造)')
    #pat_head = re.compile('(名称|序号|项目|工程|品目[一二三四1234]|第[一二三四1234](标段|候选人|中标)|包段|包号|货物|单位|数量|价格|报价|金额|总价|单价|[招投中]标|供应商|候选|编号|得分|评委|评分|名次|排名|排序|科室|方式|工期|时间|产品|开始|结束|联系|日期|面积|姓名|证号|备注|级别|地[点址]|类型|代理)')
    pat_value = re.compile("(\d{2,}.\d{1}|\d+年\d+月|\d{8,}|\d{3,}-\d{6,}|有限[责任]*公司|^\d+$)")
    tbodies = soup.find_all('table')
    # process every table; nested tables are handled innermost-first (reverse order)
    tables = []  # temporary: collect intermediate matrices
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody)
        tables.append(inner_table)
    tbodies = soup.find_all('tbody')
    # same pass for bare <tbody> elements, again in reverse order
    for tbody_index in range(1,len(tbodies)+1):
        tbody = tbodies[len(tbodies)-tbody_index]
        #trunTable(tbody)
        inner_table = trunTable(tbody)
        tables.append(inner_table)
    #return soup
    return soup, tables  # temporary: also return the matrices
  395. #数据清洗
  396. def segment(soup):
  397. segList = ["title"]
  398. commaList = ["p","div","h1", "h2", "h3", "h4", "h5", "h6", "header", "dl", "ul", "label"]
  399. spaceList = ["span"]
  400. tbodies = soup.find_all('tbody')
  401. if len(tbodies) == 0:
  402. tbodies = soup.find_all('table')
  403. # 递归遍历所有节点,插入符号
  404. for child in soup.find_all(recursive=True):
  405. if child.name == 'br':
  406. child.insert_before(',')
  407. child_text = re.sub('\s', '', child.get_text())
  408. if child_text == '' or child_text[-1] in ['。',',',':',';']:
  409. continue
  410. if child.name in segList:
  411. child.insert_after("。")
  412. if child.name in commaList:
  413. if len(child_text)>3 and len(child_text) <50: # 先判断是否字数少于50,成立加逗号,否则加句号
  414. child.insert_after(",")
  415. elif len(child_text) >=50:
  416. child.insert_after("。")
  417. #if child.name in spaceList:
  418. #child.insert_after(" ")
  419. text = str(soup.get_text())
  420. #替换"""为"“",否则导入deepdive出错
  421. text = text.replace('"',"“")
  422. #text = text.replace('"',"“").replace("\r","").replace("\n","")
  423. #删除所有空格
  424. text = re.sub("\s+","#nbsp#",text)
  425. text_list = text.split('#nbsp#')
  426. new_text = ''
  427. for i in range(len(text_list)-1):
  428. if text_list[i] == '' or text_list[i][-1] in [',','。',';',':']:
  429. new_text += text_list[i]
  430. elif re.findall('([一二三四五六七八九]、)', text_list[i+1][:4]) != []:
  431. new_text += text_list[i] + '。'
  432. elif re.findall('([0-9]、)', text_list[i+1][:4]) != []:
  433. new_text += text_list[i] + ';'
  434. elif text_list[i].isdigit() and text_list[i+1].isdigit():
  435. new_text += text_list[i] + ' '
  436. elif text_list[i][-1] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','¥'] or text_list[i+1][0] in ['-',':','(',')','/','(',')','——','年','月','日','时','分','元','万元']:
  437. new_text += text_list[i]
  438. elif len(text_list[i]) >= 3 and len(text_list[i+1]) >= 3:
  439. new_text += text_list[i] + ','
  440. else:
  441. new_text += text_list[i]
  442. new_text += text_list[-1]
  443. text = new_text
  444. #替换英文冒号为中文冒号
  445. text = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])",":",text)
  446. #替换为中文逗号
  447. text = re.sub("(?<=[\u4e00-\u9fa5]),|,(?=[\u4e00-\u9fa5])",",",text)
  448. #替换为中文分号
  449. text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
  450. #替换标点
  451. while(True):
  452. #替换连续的标点
  453. punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
  454. if punc is not None:
  455. text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
  456. punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
  457. if punc is not None:
  458. text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
  459. else:
  460. #替换标点之后的空格
  461. punc = re.search("(?P<punc>:|。|,|;)\s+",text)
  462. if punc is not None:
  463. text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
  464. else:
  465. break
  466. #将连续的中文句号替换为一个
  467. text_split = text.split("。")
  468. text_split = [x for x in text_split if len(x)>0]
  469. text = "。".join(text_split)
  470. return text
  471. def union_ner(list_ner):
  472. result_list = []
  473. union_index = []
  474. union_index_set = set()
  475. for i in range(len(list_ner)-1):
  476. if len(set([str(list_ner[i][2]),str(list_ner[i+1][2])])&set(["org","company"]))==2:
  477. if list_ner[i][1]-list_ner[i+1][0]==1:
  478. union_index_set.add(i)
  479. union_index_set.add(i+1)
  480. union_index.append((i,i+1))
  481. for i in range(len(list_ner)):
  482. if i not in union_index_set:
  483. result_list.append(list_ner[i])
  484. for item in union_index:
  485. #print(str(list_ner[item[0]][3])+str(list_ner[item[1]][3]))
  486. result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
  487. return result_list
  488. def getTokensAndNers(sentences,MAXAREA = 100000):
  489. '''
  490. @param: sentences:句子数
  491. @return 限流执行后的分词和实体识别list
  492. '''
  493. def getData(tokens,ners,process_data):
  494. process_sentences = [item[1] for item in process_data]
  495. token_ = fool.cut(process_sentences)
  496. ner_ = fool.ner(process_sentences)
  497. for i in range(len(token_)):
  498. the_index = process_data[i][0]
  499. tokens[the_index] = token_[i]
  500. ners[the_index] = ner_[i]
  501. sents = []
  502. for i in range(len(sentences)):
  503. sents.append([i,sentences[i]])
  504. sents.sort(key=lambda x:len(x[1]),reverse=True)
  505. index_ = 0
  506. tokens = [[]for i in range(len(sentences))]
  507. ners = [[]for i in range(len(sentences))]
  508. while(True):
  509. width = len(sents[index_][1])
  510. height = MAXAREA//width+1
  511. if height>len(sents)-index_:
  512. height = len(sents)-index_
  513. process_data = sents[index_:index_+height]
  514. getData(tokens, ners, process_data)
  515. index_ += height
  516. if index_>=len(sents):
  517. break
  518. return tokens,ners
def get_articles_processed(articles):
    '''
    @summary: preprocessing pipeline - table linearization, text cleanup,
        NLP tokenization and entity recognition (incl. regex money extraction)
    @param:
        articles: list of [doc_id, html_content] pairs to process
    @return: (list_articles, list_sentences, list_entitys) - one Article per
        input, plus per-article lists of Sentences and Entity objects
    '''
    # NOTE(review): body re-indented from a whitespace-mangled paste; confirm
    # nesting against VCS history.
    list_articles = []
    list_sentences = []
    list_entitys = []
    for article in articles:
        list_sentences_temp = []
        list_entitys_temp = []
        doc_id = article[0]
        # table processing: linearize tables, then clean the remaining text
        #article_processed = segment(tableToText(BeautifulSoup(article[1],"lxml")))
        soup, tables = tableToText(BeautifulSoup(article[1],"lxml"))
        article_processed = segment(soup)
        #list_articles.append([doc_id,article_processed, article[1],tables]) # temporary: keep before/after for comparison
        #return list_articles # temporary: keep before/after for comparison
        list_articles.append(Article(doc_id,article_processed))
        # NLP processing
        if article_processed is not None and len(article_processed)!=0:
            split_patten = "。"
            sentences = re.split(split_patten,article_processed)
            sentences = [x for x in sentences if len(x)!=0]
            lemmas = []
            doc_offsets = []
            dep_types = []
            dep_tokens = []
            time1 = time.time()
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # throttled execution instead of one big fool call
            tokens_all,ner_entitys_all = getTokensAndNers(sentences)
            print("nlp:",time.time()-time1)
            for sentence_index in range(len(sentences)):
                list_sentence_entitys = []
                sentence_text = sentences[sentence_index]
                tokens = tokens_all[sentence_index]
                # character offset of each token, used to map char spans -> token spans
                list_tokenbegin = []
                begin = 0
                for i in range(0,len(tokens)):
                    list_tokenbegin.append(begin)
                    begin += len(str(tokens[i]))
                list_tokenbegin.append(begin+1)
                #pos_tag = pos_all[sentence_index]
                pos_tag = ""
                ner_entitys = ner_entitys_all[sentence_index]
                list_sentences_temp.append(Sentences(doc_id=doc_id,sentence_index=sentence_index,sentence_text=sentence_text,tokens=tokens,pos_tags=pos_tag,ner_tags=ner_entitys))
                # map each NER hit (char offsets) to token indexes and store it
                for ner_entity in ner_entitys:
                    begin_index_temp = ner_entity[0]
                    entity_type = ner_entity[2]
                    entity_text = ner_entity[3]
                    for j in range(len(list_tokenbegin)):
                        if list_tokenbegin[j]==begin_index_temp:
                            begin_index = j
                            break
                        elif list_tokenbegin[j]>begin_index_temp:
                            begin_index = j-1
                            break
                    begin_index_temp += len(str(entity_text))
                    for j in range(begin_index,len(list_tokenbegin)):
                        if list_tokenbegin[j]>=begin_index_temp:
                            end_index = j-1
                            break
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))
                # money amounts are found by regex, one pattern family per style
                entity_type = "money"
                #money_patten_str = "(([1-9][\d,,]*(?:\.\d+)?[百千万亿]?[\(\)()元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[¥¥]+,?|报价|标价)[(\(]?([万])?元?[)\)]?[::]?.{,7}?([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)|([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]?)[\((]?([万元]{1,2}))*"
                list_money_pattern = {"cn":"(()()([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})())*",
                                      "key_word":"((?:[¥¥]+,?|报价|标价)(?:[(\(]?\s*([万元]*)\s*[)\)]?)\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "front_m":"((?:[(\(]?\s*([万元]+)\s*[)\)])\s*[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分]{,7}?)([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())*",
                                      "behind_m":"(()()([1-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?([万元]+)[\))]?)*"}
                set_begin = set()
                for pattern_key in list_money_pattern.keys():
                    pattern = re.compile(list_money_pattern[pattern_key])
                    all_match = re.findall(pattern, sentence_text)
                    index = 0
                    for i in range(len(all_match)):
                        if len(all_match[i][0])>0:
                            #print(all_match[i][0])
                            unit = ""
                            entity_text = all_match[i][3]
                            if pattern_key in ["key_word","front_m"]:
                                unit = all_match[i][1]
                            else:
                                unit = all_match[i][4]
                            if entity_text.find("元")>=0:
                                unit = ""
                            # advance to the numeric part only: keeping the digits
                            # as the entity avoids losing features downstream
                            index += len(all_match[i][0])-len(entity_text)-len(all_match[i][4])#-len(all_match[i][1])-len(all_match[i][2])
                            for j in range(len(list_tokenbegin)):
                                if list_tokenbegin[j]==index:
                                    begin_index = j
                                    break
                                elif list_tokenbegin[j]>index:
                                    begin_index = j-1
                                    break
                            index += len(str(entity_text))+len(all_match[i][4])#+len(all_match[i][2])+len(all_match[i][1])
                            #index += len(str(all_match[i][0]))
                            for j in range(begin_index,len(list_tokenbegin)):
                                if list_tokenbegin[j]>=index:
                                    end_index = j-1
                                    break
                            entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                            # strip everything that is not a digit / CN numeral / unit
                            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]","",entity_text)
                            if len(unit)>0:
                                entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0]))
                            else:
                                entity_text = str(getUnifyMoney(entity_text))
                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index))
                        else:
                            index += 1
                list_sentence_entitys.sort(key=lambda x:x.begin_index)
                list_entitys_temp = list_entitys_temp+list_sentence_entitys
        list_sentences.append(list_sentences_temp)
        list_entitys.append(list_entitys_temp)
    return list_articles,list_sentences,list_entitys
  643. def union_result(codeName,prem):
  644. '''
  645. @summary:模型的结果拼成字典
  646. @param:
  647. codeName:编号名称模型的结果字典
  648. prem:拿到属性的角色的字典
  649. @return:拼接起来的字典
  650. '''
  651. result = []
  652. assert len(codeName)==len(prem)
  653. for item_code,item_prem in zip(codeName,prem):
  654. if item_code[0]==item_prem[0]:
  655. result.append([item_code[0],dict(item_code[1],**item_prem[1])])
  656. return result
  657. def persistenceData(data):
  658. '''
  659. @summary:将中间结果保存到数据库-线上生产的时候不需要执行
  660. '''
  661. import psycopg2
  662. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  663. cursor = conn.cursor()
  664. for item_index in range(len(data)):
  665. item = data[item_index]
  666. doc_id = item[0]
  667. dic = item[1]
  668. code = dic['code']
  669. name = dic['name']
  670. prem = dic['prem']
  671. if len(code)==0:
  672. code_insert = ""
  673. else:
  674. code_insert = ";".join(code)
  675. prem_insert = ""
  676. for item in prem:
  677. for x in item:
  678. if isinstance(x, list):
  679. if len(x)>0:
  680. for x1 in x:
  681. prem_insert+="/".join(x1)+","
  682. prem_insert+="$"
  683. else:
  684. prem_insert+=str(x)+"$"
  685. prem_insert+=";"
  686. sql = " insert into predict_validation(doc_id,code,name,prem) values('"+doc_id+"','"+code_insert+"','"+name+"','"+prem_insert+"')"
  687. cursor.execute(sql)
  688. conn.commit()
  689. conn.close()
def persistenceData1(list_entitys,list_sentences):
    '''
    @summary: persist intermediate entity/sentence results to the database -
        not needed in the online production pipeline
    @param:
        list_entitys: per-article lists of Entity objects
        list_sentences: per-article lists of Sentences objects
    '''
    import psycopg2
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()
    # NOTE(review): all statements below are built by string concatenation -
    # vulnerable to SQL injection / broken quoting on scraped text; consider
    # parameterized queries.
    for list_entity in list_entitys:
        for entity in list_entity:
            if entity.values is not None:
                sql = " insert into predict_entity(entity_id,entity_text,entity_type,doc_id,sentence_index,begin_index,end_index,label,values) values('"+str(entity.entity_id)+"','"+str(entity.entity_text)+"','"+str(entity.entity_type)+"','"+str(entity.doc_id)+"',"+str(entity.sentence_index)+","+str(entity.begin_index)+","+str(entity.end_index)+","+str(entity.label)+",array"+str(entity.values)+")"
            else:
                sql = " insert into predict_entity(entity_id,entity_text,entity_type,doc_id,sentence_index,begin_index,end_index) values('"+str(entity.entity_id)+"','"+str(entity.entity_text)+"','"+str(entity.entity_type)+"','"+str(entity.doc_id)+"',"+str(entity.sentence_index)+","+str(entity.begin_index)+","+str(entity.end_index)+")"
            cursor.execute(sql)
    for list_sentence in list_sentences:
        for sentence in list_sentence:
            # Build a postgres array literal from the tokens, doubling any
            # single-quote token to escape it.
            str_tokens = "["
            for item in sentence.tokens:
                str_tokens += "'"
                if item=="'":
                    str_tokens += "''"
                else:
                    str_tokens += item
                str_tokens += "',"
            str_tokens = str_tokens[:-1]+"]"
            sql = " insert into predict_sentences(doc_id,sentence_index,tokens) values('"+sentence.doc_id+"',"+str(sentence.sentence_index)+",array"+str_tokens+")"
            cursor.execute(sql)
    conn.commit()
    conn.close()
  719. if __name__=="__main__":
  720. import glob
  721. import re
  722. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190320/*.html')
  723. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190306/*.html')
  724. #files = glob.glob( 'F:/工作文档/实体识别实体对其//20190513/*.html')
  725. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58466066.html')
  726. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
  727. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html')
  728. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58521609.html')
  729. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58502967.html') # 内容缺失
  730. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58445908.html') # 把采购人、单位名称识别为表头
  731. #files = glob.glob('F:/工作文档/实体识别实体对其/20190416要素\\比地_101_61320687.html') #行表头识别不到
  732. #files = glob.glob('F:/工作文档/实体识别实体对其/20190306\\比地_52_57131306.html') # span 空格区分不了
  733. #files = glob.glob('F:/工作文档/实体识别实体对其/20190320/比地_101_58522893.html') # 某行tr没有td
  734. files = glob.glob('F:/工作文档/实体识别实体对其/20190320\\比地_101_58447523.html')
  735. #files = glob.glob('F:/工作文档/实体识别实体对其/test/*.html')
  736. #files = ['F:/工作文档/实体识别实体对其/1.html']
  737. print(len(files))
  738. i = 0
  739. filePaths =[]
  740. for file in files:
  741. with open(file, 'r', encoding='utf-8') as f:
  742. content = f.read()
  743. filePaths.append([file, content])
  744. #tables = re.findall('<table[^<].*?</table>', re.sub('\s','',content))
  745. ##if len(tables) == 0 and re.search('采购人', content) != None and re.search('代理机构',content) != None and re.search('&nbsp;',content) != None:
  746. ##filePaths.append([file, content])
  747. #for table in tables:
  748. #if re.search('排序', table) != None or re.search('名次',table) != None\
  749. #or re.search('排名', table) != None:
  750. ##if re.search('colspan', table) != None and re.search('rowspan',table) != None and re.search('第一中标', table) != None:
  751. #filePaths.append([file, content])
  752. #break
  753. list_articles = get_articles_processed(filePaths)
  754. #list_articles,list_sentences,list_entitys = get_articles_processed(filePaths)
  755. with open('F:/工作文档/实体识别实体对其/20190416要素/list_articles_20190306.pkl', 'wb') as f:
  756. pickle.dump(list_articles, f)
  757. print(len(list_articles))
  758. #doc_id = "09067598-7076-11e8-9dae-52540087e52f"
  759. #import psycopg2
  760. #conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  761. #cursor = conn.cursor()
  762. #sql = " select id,content from articles where id='"+doc_id+"' "
  763. #cursor.execute(sql)
  764. #ContentIDs = cursor.fetchall()
  765. #list_articles,list_sentences,list_entitys = get_articles_processed(ContentIDs)
  766. #for i in range(len(list_entitys)):
  767. #for j in range(len(list_entitys[i])):
  768. #entity = list_entitys[i][j]
  769. #sentence = list_sentences[i][entity.sentence_index]
  770. #tokens = sentence.tokens
  771. #begin_index = entity.begin_index
  772. #end_index = entity.end_index
  773. #if entity.entity_type in ['org','company']:
  774. #item_x = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1])
  775. ##print(item_x)