há 1 ano atrás · bd9e682295
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -353,14 +353,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     getAttributes.confirm_prem(prem[0]['prem'], channel_dic)
														
 
															     # 提取拟在建所需字段
														
 
															-    # start_time = time.time()
														
 
															-    # pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
														
 
															-    # cost_time["pb_extract"] = round(time.time() - start_time, 2)
														
 
															+    start_time = time.time()
														
 
															+    pb_json = predictor.getPredictor('pb_extract').predict(prem, list_articles, list_sentences, list_entitys, title, codeName[0], text)
														
 
															+    cost_time["pb_extract"] = round(time.time() - start_time, 2)
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															     version_date = {'version_date': '2024-04-30'}
														
 
															-    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
														
 
															+    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
														
 
															     '''最终检查修正招标、中标金额'''
														
 
															     getAttributes.limit_maximum_amount(data_res, list_entitys[0])
														
--- a/BiddingKG/dl/proposed_building/pb_extract.py
+++ b/BiddingKG/dl/proposed_building/pb_extract.py
@@ -876,8 +876,6 @@ def extract_several_money(list_sentence, html='', is_obj=True):
 
															 def extract_max_floor(content, html=None):
														
 
															-    from bs4 import BeautifulSoup
														
 
															-
														
 
															     def match_floor(_reg, _content, _reg2=None, _tables_and_divs=None):
														
 
															         _match = re.finditer(_reg, _content)
														
 
															         _floor_list = []
														
@@ -977,8 +975,6 @@ def extract_max_floor(content, html=None):
 
															 def extract_structure(content, html=None, structure_keyword_list=None):
														
 
															-    from bs4 import BeautifulSoup
														
 
															-
														
 
															     # reg = '框架结构|钢框架结构|混凝土框架结构|剪力墙结构|框架-剪力墙结构|框架+剪力墙结构|框架和剪力墙结构|框架及剪力墙结构|混凝土剪力墙结构|筒体结构|桅式结构|墙板结构|膜结构|悬索结构|板柱结构|充气结构|网架结构|壳体结构|拱形结构|穹顶结构|混凝土结构|钢筋混凝土框架结构|钢筋混凝土筒仓结构|钢结构|砌体结构|木结构|砖混结构|排架结构|束筒结构|薄壳结构|钢混结构|砖木结构|砌体结构|钢砼结构|框剪结构|钢筋混凝土框架结构|筒中筒结构|框筒结构|桁架结构|拱券结构|钢筋混凝土结构|框架核心筒结构|门式钢架结构|门钢结构|轻钢结构|钢-混凝土框架结构|木框架结构|空间网格结构|框架筒体结构|砖拱结构|钢筋砼结构|核心筒结构|框架-核心筒结构'
														
 
															     reg1 = '(结构(楼层|)(形式|类型|类别|体系|结构)[为是：:])([^结]{2,8}结构)'
														
@@ -1187,17 +1183,20 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
															         is_yeji = 0
														
 
															         reg_yeji = '业绩|选取原因|奖项|获奖|供应商信息|近年完成|中标单位信息|评标情况|类似项目|资质|候选人情况'
														
 
															         for index3, obj in enumerate(_tables_and_divs):
														
 
															-            if '<tr' not in str(obj):
														
 
															+            # if '<tr' not in str(obj):
														
 
															+            if not obj.find('tr'):  # skip nodes without table rows (same intent as the old "'<tr' not in str(obj)" guard)
														
 
															                 continue
														
 
															             _table = obj
														
 
															-            _m = re.search(re.escape(_entity_text), str(_table))
														
 
															+            _table_text = str(_table.get_text())
														
 
															+            # _m = re.search(re.escape(_entity_text), str(_table))
														
 
															+            _m = re.search(re.escape(_entity_text), _table_text)
														
 
															             if not _m:
														
 
															                 _entity_text = _entity_text[1:]
														
 
															-                _m = re.search(re.escape(_entity_text[1:]), str(_table))
														
 
															+                _m = re.search(re.escape(_entity_text[1:]), _table_text)
														
 
															             if not _m:
														
 
															                 _entity_text = _entity_text[:-1]
														
 
															-                _m = re.search(re.escape(_entity_text[:-1]), str(_table))
														
 
															+                _m = re.search(re.escape(_entity_text[:-1]), _table_text)
														
 
															             # print('_entity_text', _entity_text)
														
 
															             if _m:
														
 
															                 rows = _table.find_all('tr')
														
@@ -1209,14 +1208,15 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
															                 # print('max_col_span', max_col_span)
														
 
															                 for index, row in enumerate(rows):
														
 
															-                    if re.search(re.escape(_entity_text), str(row)):
														
 
															+                    if re.search(re.escape(_entity_text), str(row.get_text())):
														
 
															                         cells = row.find_all('td')
														
 
															                         if len(cells) == 0:
														
 
															                             continue
														
 
															                         # print('cells', cells[0])
														
 
															                         # 第一列是否含业绩
														
 
															-                        if re.search(reg_yeji, str(cells[0])):
														
 
															+                        # if re.search(reg_yeji, str(cells[0])):
														
 
															+                        if re.search(reg_yeji, str(cells[0].get_text())):
														
 
															                             # logging.info('is_yeji_table 1')
														
 
															                             is_yeji = 1
														
 
															                         # 前面几行是否有占多列的有业绩
														
@@ -1226,14 +1226,14 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
															                                     # print('len(rows[index2])', len(row2.find_all('td')))
														
 
															                                     # if len(row2.find_all('td')) <= max_col_span / 2:
														
 
															                                     #     print(re.search('业绩', str(row2)), str(row2))
														
 
															-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
														
 
															+                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2.get_text())):
														
 
															                                         # logging.info('is_yeji_table 2')
														
 
															                                         is_yeji = 1
														
 
															                         break
														
 
															                 # 前面都找不到，那么找表格上方的两行
														
 
															-                div_list = [str(x) for x in _tables_and_divs[max(0, index3-2):index3]]
														
 
															+                div_list = [str(x.get_text()) for x in _tables_and_divs[max(0, index3-2):index3]]
														
 
															                 if not is_yeji and re.search(reg_yeji, ' '.join(div_list)):
														
 
															                     # logging.info('is_yeji_table 3')
														
 
															                     is_yeji = 1
														
@@ -1261,8 +1261,6 @@ def judge_yeji(end_index, content, judge_len=300, tables_and_divs=None, entity=N
 
															 def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
														
 
															-    from bs4 import BeautifulSoup
														
 
															-
														
 
															     def getDigitsDic(_unit):
														
 
															         '''
														
 
															         @summary:拿到中文对应的数字
														
@@ -1336,53 +1334,6 @@ def get_several_money(sentence_text, found_yeji, in_attachment=False, html=''):
 
															             return Decimal(0)
														
 
															         return result
														
 
															-    def is_yeji_table(_tables_and_divs, _entity_text):
														
 
															-        if not _tables_and_divs:
														
 
															-            return 0
														
 
															-
														
 
															-        is_yeji = 0
														
 
															-        reg_yeji = '业绩|选取原因|奖项|获奖'
														
 
															-        for index3, obj in enumerate(_tables_and_divs):
														
 
															-            if '<tr' not in str(obj):
														
 
															-                continue
														
 
															-
														
 
															-            _table = obj
														
 
															-            _m = re.search(re.escape(_entity_text), str(_table))
														
 
															-            if _m:
														
 
															-                rows = _table.find_all('tr')
														
 
															-
														
 
															-                max_col_span = 0
														
 
															-                for row in rows:
														
 
															-                    if len(row.find_all('td')) > max_col_span:
														
 
															-                        max_col_span = len(row.find_all('td'))
														
 
															-                # print('max_col_span', max_col_span)
														
 
															-
														
 
															-                for index, row in enumerate(rows):
														
 
															-                    if re.search(re.escape(_entity_text), str(row)):
														
 
															-                        cells = row.find_all('td')
														
 
															-                        # 第一列是否含业绩
														
 
															-                        if re.search(re.escape(_entity_text), str(cells[0])):
														
 
															-                            is_yeji = 1
														
 
															-                        # 前面几行是否有占多列的有业绩
														
 
															-                        else:
														
 
															-                            if index > 0:
														
 
															-                                for row2 in rows[:index][::-1]:
														
 
															-                                    # print('len(rows[index2])', len(row2.find_all('td')))
														
 
															-                                    # if len(row2.find_all('td')) <= max_col_span / 2:
														
 
															-                                    #     print(re.search('业绩', str(row2)), str(row2))
														
 
															-                                    if len(row2.find_all('td')) <= max_col_span / 2 and re.search(reg_yeji, str(row2)):
														
 
															-                                        is_yeji = 1
														
 
															-
														
 
															-                        break
														
 
															-
														
 
															-                # 前面都找不到，那么找表格上方的两行
														
 
															-                if not is_yeji and re.search(reg_yeji, ' '.join(_tables_and_divs[index3-2:index3])):
														
 
															-                    is_yeji = 1
														
 
															-                break
														
 
															-        # print('is_yeji', is_yeji)
														
 
															-        return is_yeji
														
 
															-
														
 
															-
														
 
															     # 提取表格用于判断业绩
														
 
															     if html:
														
 
															         soup = BeautifulSoup(html, 'lxml')