1 éve · bd9e682295
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -353,14 +353,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     getAttributes.confirm_prem(prem[0]['prem'], channel_dic)
			
 
				 
			
 
				     # 提取拟在建所需字段
			
 
				-    # start_time = time.time()
			
 
				-    # pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
			
 
				-    # cost_time["pb_extract"] = round(time.time() - start_time, 2)
			
 
				+    start_time = time.time()
			
 
				+    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
			
 
				+    cost_time["pb_extract"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				     version_date = {'version_date': '2024-04-30'}
			
 
				-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
			
 
				+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     '''最终检查修正招标、中标金额'''
			
 
				     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
			
--- a/BiddingKG/dl/proposed_building/pb_extract.py
+++ b/BiddingKG/dl/proposed_building/pb_extract.py
@@ -876,8 +876,6 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
				 
			
 
				 
			
 
				 def extract_max_floor(content, html=None):
			
 
				-    from bs4 import BeautifulSoup
			
 
				-
			
 
				     def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
			
 
				         _match = re.finditer(_reg, _content)
			
 
				         _floor_list = []
			
@@ -977,8 +975,6 @@ def extract_max_floor(content, html=None):
 
				 
			
 
				 
			
 
				 def extract_structure(content, html=None, structure_keyword_list=None):
			
 
				-    from bs4 import BeautifulSoup
			
 
				-
			
 
				     # reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
			
 
				     reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是：:])([^结]{2,8}结构)'
			
 
				 
			
@@ -1187,17 +1183,20 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				         is_yeji = 0
			
 
				         reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
			
 
				         for index3, obj in enumerate(_tables_and_divs):
			
 
				-            if '<tr' not in str(obj):
			
 
				+            # if '<tr' not in str(obj):
			
 
				+            if obj.find('tr'):
			
 
				                 continue
			
 
				 
			
 
				             _table = obj
			
 
				-            _m = re.search(re.escape(_entity_text), str(_table))
			
 
				+            _table_text = str(_table.get_text())
			
 
				+            # _m = re.search(re.escape(_entity_text), str(_table))
			
 
				+            _m = re.search(re.escape(_entity_text), _table_text)
			
 
				             if not _m:
			
 
				                 _entity_text = _entity_text[1:]
			
 
				-                _m = re.search(re.escape(_entity_text[1:]), str(_table))
			
 
				+                _m = re.search(re.escape(_entity_text[1:]), _table_text)
			
 
				             if not _m:
			
 
				                 _entity_text = _entity_text[:-1]
			
 
				-                _m = re.search(re.escape(_entity_text[:-1]), str(_table))
			
 
				+                _m = re.search(re.escape(_entity_text[:-1]), _table_text)
			
 
				             # print('_entity_text', _entity_text)
			
 
				             if _m:
			
 
				                 rows = _table.find_all('tr')
			
@@ -1209,14 +1208,15 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				                 # print('max_col_span', max_col_span)
			
 
				 
			
 
				                 for index, row in enumerate(rows):
			
 
				-                    if re.search(re.escape(_entity_text), str(row)):
			
 
				+                    if re.search(re.escape(_entity_text), str(row.get_text())):
			
 
				                         cells = row.find_all('td')
			
 
				                         if len(cells) == 0:
			
 
				                             continue
			
 
				 
			
 
				                         # print('cells', cells[0])
			
 
				                         # 第一列是否含业绩
			
 
				-                        if re.search(reg_yeji, str(cells[0])):
			
 
				+                        # if re.search(reg_yeji, str(cells[0])):
			
 
				+                        if re.search(reg_yeji, str(cells[0].get_text())):
			
 
				                             # logging.info('is_yeji_table 1')
			
 
				                             is_yeji = 1
			
 
				                         # 前面几行是否有占多列的有业绩
			
@@ -1226,14 +1226,14 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				                                     # print('len(rows[index2])', len(row2.find_all('td')))
			
 
				                                     # if len(row2.find_all('td')) <= max_col_span / 2:
			
 
				                                     #     print(re.search('业绩', str(row2)), str(row2))
			
 
				-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
			
 
				+                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
			
 
				                                         # logging.info('is_yeji_table 2')
			
 
				                                         is_yeji = 1
			
 
				 
			
 
				                         break
			
 
				 
			
 
				                 # 前面都找不到，那么找表格上方的两行
			
 
				-                div_list = [str(x) for x in _tables_and_divs[max(0, index3-2):index3]]
			
 
				+                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
			
 
				                 if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
			
 
				                     # logging.info('is_yeji_table 3')
			
 
				                     is_yeji = 1
			
@@ -1261,8 +1261,6 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
				 
			
 
				 
			
 
				 def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
			
 
				-    from bs4 import BeautifulSoup
			
 
				-
			
 
				     def getDigitsDic(_unit):
			
 
				         '''
			
 
				         @summary:拿到中文对应的数字
			
@@ -1336,53 +1334,6 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
				             return Decimal(0)
			
 
				         return result
			
 
				 
			
 
				-    def is_yeji_table(_tables_and_divs, _entity_text):
			
 
				-        if not _tables_and_divs:
			
 
				-            return 0
			
 
				-
			
 
				-        is_yeji = 0
			
 
				-        reg_yeji = '业绩|选取原因|奖项|获奖'
			
 
				-        for index3, obj in enumerate(_tables_and_divs):
			
 
				-            if '<tr' not in str(obj):
			
 
				-                continue
			
 
				-
			
 
				-            _table = obj
			
 
				-            _m = re.search(re.escape(_entity_text), str(_table))
			
 
				-            if _m:
			
 
				-                rows = _table.find_all('tr')
			
 
				-
			
 
				-                max_col_span = 0
			
 
				-                for row in rows:
			
 
				-                    if len(row.find_all('td')) > max_col_span:
			
 
				-                        max_col_span = len(row.find_all('td'))
			
 
				-                # print('max_col_span', max_col_span)
			
 
				-
			
 
				-                for index, row in enumerate(rows):
			
 
				-                    if re.search(re.escape(_entity_text), str(row)):
			
 
				-                        cells = row.find_all('td')
			
 
				-                        # 第一列是否含业绩
			
 
				-                        if re.search(re.escape(_entity_text), str(cells[0])):
			
 
				-                            is_yeji = 1
			
 
				-                        # 前面几行是否有占多列的有业绩
			
 
				-                        else:
			
 
				-                            if index > 0:
			
 
				-                                for row2 in rows[:index][::-1]:
			
 
				-                                    # print('len(rows[index2])', len(row2.find_all('td')))
			
 
				-                                    # if len(row2.find_all('td')) <= max_col_span / 2:
			
 
				-                                    #     print(re.search('业绩', str(row2)), str(row2))
			
 
				-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
			
 
				-                                        is_yeji = 1
			
 
				-
			
 
				-                        break
			
 
				-
			
 
				-                # 前面都找不到，那么找表格上方的两行
			
 
				-                if not is_yeji and re.search(reg_yeji, ' '.join(_tables_and_divs[index3-2:index3])):
			
 
				-                    is_yeji = 1
			
 
				-                break
			
 
				-        # print('is_yeji', is_yeji)
			
 
				-        return is_yeji
			
 
				-
			
 
				-
			
 
				     # 提取表格用于判断业绩
			
 
				     if html:
			
 
				         soup = BeautifulSoup(html, 'lxml')