|
@@ -876,8 +876,6 @@ def extract_several_money(list_sentence, html='', is_obj=True):
|
|
|
|
|
|
|
|
|
|
def extract_max_floor(content, html=None):
|
|
def extract_max_floor(content, html=None):
|
|
- from bs4 import BeautifulSoup
|
|
|
|
-
|
|
|
|
def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
|
|
def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
|
|
_match = re.finditer(_reg, _content)
|
|
_match = re.finditer(_reg, _content)
|
|
_floor_list = []
|
|
_floor_list = []
|
|
@@ -977,8 +975,6 @@ def extract_max_floor(content, html=None):
|
|
|
|
|
|
|
|
|
|
def extract_structure(content, html=None, structure_keyword_list=None):
|
|
def extract_structure(content, html=None, structure_keyword_list=None):
|
|
- from bs4 import BeautifulSoup
|
|
|
|
-
|
|
|
|
# reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
|
|
# reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
|
|
reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是::])([^结]{2,8}结构)'
|
|
reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是::])([^结]{2,8}结构)'
|
|
|
|
|
|
@@ -1187,17 +1183,20 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
is_yeji = 0
|
|
is_yeji = 0
|
|
reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
|
|
reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
|
|
for index3, obj in enumerate(_tables_and_divs):
|
|
for index3, obj in enumerate(_tables_and_divs):
|
|
- if '<tr' not in str(obj):
|
|
|
|
|
|
+ # if '<tr' not in str(obj):
|
|
|
|
+ if obj.find('tr'):
|
|
continue
|
|
continue
|
|
|
|
|
|
_table = obj
|
|
_table = obj
|
|
- _m = re.search(re.escape(_entity_text), str(_table))
|
|
|
|
|
|
+ _table_text = str(_table.get_text())
|
|
|
|
+ # _m = re.search(re.escape(_entity_text), str(_table))
|
|
|
|
+ _m = re.search(re.escape(_entity_text), _table_text)
|
|
if not _m:
|
|
if not _m:
|
|
_entity_text = _entity_text[1:]
|
|
_entity_text = _entity_text[1:]
|
|
- _m = re.search(re.escape(_entity_text[1:]), str(_table))
|
|
|
|
|
|
+ _m = re.search(re.escape(_entity_text[1:]), _table_text)
|
|
if not _m:
|
|
if not _m:
|
|
_entity_text = _entity_text[:-1]
|
|
_entity_text = _entity_text[:-1]
|
|
- _m = re.search(re.escape(_entity_text[:-1]), str(_table))
|
|
|
|
|
|
+ _m = re.search(re.escape(_entity_text[:-1]), _table_text)
|
|
# print('_entity_text', _entity_text)
|
|
# print('_entity_text', _entity_text)
|
|
if _m:
|
|
if _m:
|
|
rows = _table.find_all('tr')
|
|
rows = _table.find_all('tr')
|
|
@@ -1209,14 +1208,15 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
# print('max_col_span', max_col_span)
|
|
# print('max_col_span', max_col_span)
|
|
|
|
|
|
for index, row in enumerate(rows):
|
|
for index, row in enumerate(rows):
|
|
- if re.search(re.escape(_entity_text), str(row)):
|
|
|
|
|
|
+ if re.search(re.escape(_entity_text), str(row.get_text())):
|
|
cells = row.find_all('td')
|
|
cells = row.find_all('td')
|
|
if len(cells) == 0:
|
|
if len(cells) == 0:
|
|
continue
|
|
continue
|
|
|
|
|
|
# print('cells', cells[0])
|
|
# print('cells', cells[0])
|
|
# 第一列是否含业绩
|
|
# 第一列是否含业绩
|
|
- if re.search(reg_yeji, str(cells[0])):
|
|
|
|
|
|
+ # if re.search(reg_yeji, str(cells[0])):
|
|
|
|
+ if re.search(reg_yeji, str(cells[0].get_text())):
|
|
# logging.info('is_yeji_table 1')
|
|
# logging.info('is_yeji_table 1')
|
|
is_yeji = 1
|
|
is_yeji = 1
|
|
# 前面几行是否有占多列的有业绩
|
|
# 前面几行是否有占多列的有业绩
|
|
@@ -1226,14 +1226,14 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
# print('len(rows[index2])', len(row2.find_all('td')))
|
|
# print('len(rows[index2])', len(row2.find_all('td')))
|
|
# if len(row2.find_all('td')) <= max_col_span / 2:
|
|
# if len(row2.find_all('td')) <= max_col_span / 2:
|
|
# print(re.search('业绩', str(row2)), str(row2))
|
|
# print(re.search('业绩', str(row2)), str(row2))
|
|
- if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
|
|
|
|
|
|
+ if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
|
|
# logging.info('is_yeji_table 2')
|
|
# logging.info('is_yeji_table 2')
|
|
is_yeji = 1
|
|
is_yeji = 1
|
|
|
|
|
|
break
|
|
break
|
|
|
|
|
|
# 前面都找不到,那么找表格上方的两行
|
|
# 前面都找不到,那么找表格上方的两行
|
|
- div_list = [str(x) for x in _tables_and_divs[max(0, index3-2):index3]]
|
|
|
|
|
|
+ div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
|
|
if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
|
|
if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
|
|
# logging.info('is_yeji_table 3')
|
|
# logging.info('is_yeji_table 3')
|
|
is_yeji = 1
|
|
is_yeji = 1
|
|
@@ -1261,8 +1261,6 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
|
|
|
|
|
|
|
|
|
|
def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
- from bs4 import BeautifulSoup
|
|
|
|
-
|
|
|
|
def getDigitsDic(_unit):
|
|
def getDigitsDic(_unit):
|
|
'''
|
|
'''
|
|
@summary:拿到中文对应的数字
|
|
@summary:拿到中文对应的数字
|
|
@@ -1336,53 +1334,6 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
|
|
return Decimal(0)
|
|
return Decimal(0)
|
|
return result
|
|
return result
|
|
|
|
|
|
- def is_yeji_table(_tables_and_divs, _entity_text):
|
|
|
|
- if not _tables_and_divs:
|
|
|
|
- return 0
|
|
|
|
-
|
|
|
|
- is_yeji = 0
|
|
|
|
- reg_yeji = '业绩|选取原因|奖项|获奖'
|
|
|
|
- for index3, obj in enumerate(_tables_and_divs):
|
|
|
|
- if '<tr' not in str(obj):
|
|
|
|
- continue
|
|
|
|
-
|
|
|
|
- _table = obj
|
|
|
|
- _m = re.search(re.escape(_entity_text), str(_table))
|
|
|
|
- if _m:
|
|
|
|
- rows = _table.find_all('tr')
|
|
|
|
-
|
|
|
|
- max_col_span = 0
|
|
|
|
- for row in rows:
|
|
|
|
- if len(row.find_all('td')) > max_col_span:
|
|
|
|
- max_col_span = len(row.find_all('td'))
|
|
|
|
- # print('max_col_span', max_col_span)
|
|
|
|
-
|
|
|
|
- for index, row in enumerate(rows):
|
|
|
|
- if re.search(re.escape(_entity_text), str(row)):
|
|
|
|
- cells = row.find_all('td')
|
|
|
|
- # 第一列是否含业绩
|
|
|
|
- if re.search(re.escape(_entity_text), str(cells[0])):
|
|
|
|
- is_yeji = 1
|
|
|
|
- # 前面几行是否有占多列的有业绩
|
|
|
|
- else:
|
|
|
|
- if index > 0:
|
|
|
|
- for row2 in rows[:index][::-1]:
|
|
|
|
- # print('len(rows[index2])', len(row2.find_all('td')))
|
|
|
|
- # if len(row2.find_all('td')) <= max_col_span / 2:
|
|
|
|
- # print(re.search('业绩', str(row2)), str(row2))
|
|
|
|
- if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
|
|
|
|
- is_yeji = 1
|
|
|
|
-
|
|
|
|
- break
|
|
|
|
-
|
|
|
|
- # 前面都找不到,那么找表格上方的两行
|
|
|
|
- if not is_yeji and re.search(reg_yeji, ' '.join(_tables_and_divs[index3-2:index3])):
|
|
|
|
- is_yeji = 1
|
|
|
|
- break
|
|
|
|
- # print('is_yeji', is_yeji)
|
|
|
|
- return is_yeji
|
|
|
|
-
|
|
|
|
-
|
|
|
|
# 提取表格用于判断业绩
|
|
# 提取表格用于判断业绩
|
|
if html:
|
|
if html:
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
soup = BeautifulSoup(html, 'lxml')
|