Explorar el Código

表格处理超时修复

znj hace 3 años
padre
commit
c33054f7a4

+ 1 - 1
BiddingKG/dl/entityLink/entityLink.py

@@ -121,7 +121,7 @@ def link_entitys(list_entitys,on_value=0.81):
                             _entity.entity_text = _ent.entity_text
                             used_linked_entitys.append(_ent)
                             # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
-
+# 用于去重的标题
 def doctitle_refine(doctitle):
     _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                              r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)

+ 74 - 36
BiddingKG/dl/interface/Preprocessing.py

@@ -384,34 +384,38 @@ def tableToText(soup):
         set_item = set()
         height = len(inner_table)
         width = len(inner_table[0])
+        empty_set = set()
         for i in range(height):
             for j in range(width):
                 item = inner_table[i][j][0]
-                set_item.add(item)
+                if item.strip()=="":
+                    empty_set.add(item)
+                else:
+                    set_item.add(item)
         list_item = list(set_item)
-        x = []
-        for item in list_item:
-            x.append(getPredictor("form").encode(item))
-        predict_y = getPredictor("form").predict(np.array(x),type="item")
-        _dict = dict()
-        
-        for item,values in zip(list_item,list(predict_y)):
-            _dict[item] = values[1]
-            # print("##",item,values)
-        #print(_dict)
-        for i in range(height):
-            for j in range(width):
-                item = inner_table[i][j][0]
-                inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
+        if list_item:
+            x = []
+            for item in list_item:
+                x.append(getPredictor("form").encode(item))
+            predict_y = getPredictor("form").predict(np.array(x),type="item")
+            _dict = dict()
+
+            for item,values in zip(list_item,list(predict_y)):
+                _dict[item] = values[1]
+                # print("##",item,values)
+            #print(_dict)
+            for i in range(height):
+                for j in range(width):
+                    item = inner_table[i][j][0]
+                    if item not in empty_set:
+                        inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
 
         # print("=====")
         # for item in inner_table:
         #     print(item)
         # print("======")
-
         repairTable(inner_table)
         head_list = sliceTable(inner_table)
-
         
         return inner_table,head_list
 
@@ -985,15 +989,28 @@ def tableToText(soup):
                 if inner_table[h][w][0]==fix_value:
                     inner_table[h][w][0] = ""
     
-    def trunTable(tbody):
+    def trunTable(tbody,in_attachment):
+        # print(tbody.find('tbody'))
+        # 附件中的表格,排除异常错乱的表格
+        if in_attachment:
+            if tbody.name=='table':
+                _tbody = tbody.find('tbody')
+            else:
+                _tbody = tbody
+            _td_len_list = []
+            for _tr in _tbody.find_all(recursive=False):
+                len_td = len(_tr.find_all(recursive=False))
+                _td_len_list.append(len_td)
+            if len(list(set(_td_len_list)))>8:
+                return None
         fixSpan(tbody)
         inner_table = getTable(tbody)
         inner_table = fixTable(inner_table)
         if len(inner_table)>0 and len(inner_table[0])>0:
             #inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
             #inner_table,head_list = setHead_inline(inner_table)
-            # inner_table, head_list = setHead_initem(inner_table,pat_head)
-            inner_table, head_list = set_head_model(inner_table)
+            inner_table, head_list = setHead_initem(inner_table,pat_head)
+            # inner_table, head_list = set_head_model(inner_table)
             # inner_table,head_list = setHead_incontext(inner_table,pat_head)
             # print(inner_table)
             # for begin in range(len(head_list[:-1])):
@@ -1033,20 +1050,36 @@ def tableToText(soup):
                 ul.get_text(), re.S)))>3:
             ul.extract()
 
-    tbodies = soup.find_all('table')
+    # tbodies = soup.find_all('table')
     # 遍历表格中的每个tbody
+    tbodies = []
+    in_attachment = False
+    for _part in soup.find_all():
+        if _part.name=='table':
+            tbodies.append((_part,in_attachment))
+        elif _part.name=='div':
+            if 'class' in _part.attrs and "richTextFetch" in _part['class']:
+                in_attachment = True
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
-        tbody = tbodies[len(tbodies)-tbody_index]
-        inner_table = trunTable(tbody)
+        tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
 
-    tbodies = soup.find_all('tbody')
+    # tbodies = soup.find_all('tbody')
     # 遍历表格中的每个tbody
+    tbodies = []
+    in_attachment = False
+    for _part in soup.find_all():
+        if _part.name == 'tbody':
+            tbodies.append((_part, in_attachment))
+        elif _part.name == 'div':
+            if 'class' in _part.attrs and "richTextFetch" in _part['class']:
+                in_attachment = True
     #逆序处理嵌套表格
     for tbody_index in range(1,len(tbodies)+1):
-        tbody = tbodies[len(tbodies)-tbody_index]
-        inner_table = trunTable(tbody)
+        tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
+        inner_table = trunTable(tbody,_in_attachment)
         list_innerTable.append(inner_table)
 
     return soup
@@ -1785,15 +1818,20 @@ def article_limit(soup,limit_words=30000):
             while n_soup:
                 text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
         if len(_text_split[1])>limit_words:
-            attachment_text_nums = 0
-            attachment_skip = False
-            for part in attachment_part.find_all(recursive=False):
-                if not attachment_skip:
-                    attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
-                    if attachment_text_nums>=limit_words:
-                        attachment_skip = True
-                else:
-                    part.decompose()
+            # attachment_html纯文本,无子结构
+            if len(attachment_part.find_all(recursive=False))==0:
+                attachment_part.string = str(attachment_part.get_text())[:limit_words]
+            else:
+                attachment_text_nums = 0
+                attachment_skip = False
+                for part in attachment_part.find_all(recursive=False):
+                    if not attachment_skip:
+                        attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
+                        if attachment_text_nums>=limit_words:
+                            part.string = str(part.get_text())[:attachment_text_nums-limit_words]
+                            attachment_skip = True
+                    else:
+                        part.decompose()
 
     return soup
 
@@ -1843,8 +1881,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         # 正文和附件内容限制字数30000
         article_processed = article_limit(article_processed,limit_words=30000)
         article_processed = get_preprocessed_outline(article_processed)
+        # print('article_processed')
         article_processed = tableToText(article_processed)
-        # print(article_processed)
         article_processed = segment(article_processed)
         article_processed = article_processed.replace('.','.') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改

+ 68 - 0
BiddingKG/dl/interface/extract.py

@@ -42,6 +42,70 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
+def extractCount(extract_dict):
+    # Count the extracted elements in extract_dict: role entries, positive amounts,
+    # the first project code and the project name. Returns the count as an int.
+    if len(extract_dict):  # an empty dict falls through to {}; None would raise TypeError here
+        _extract = extract_dict
+    else:
+        _extract = {}
+    print(_extract)  # NOTE(review): prints the whole dict on every call - looks like leftover debug output
+    dict_pack = _extract.get("prem",{})  # presumably per-package results keyed by package name - verify against caller
+    extract_count = 0
+    list_code = _extract.get("code",[])
+    if len(list_code)>0:
+        project_code = list_code[0]  # only the first code is considered
+    else:
+        project_code = ""
+    project_name = _extract.get("name","")
+    bidding_budget = ""  # bidding_budget / win_tenderer / win_bid_price are filled below but never returned
+    win_tenderer = ""
+    win_bid_price = ""
+    for _key in dict_pack.keys():
+        if "tendereeMoney" in dict_pack[_key] and dict_pack[_key]["tendereeMoney"]!='' and float(dict_pack[_key]["tendereeMoney"])>0:
+            extract_count += 1  # +1 for a package with a positive tendereeMoney
+            if bidding_budget=="":
+                bidding_budget = str(float(dict_pack[_key]["tendereeMoney"]))
+        for _role in dict_pack[_key]["roleList"]:  # direct index: a package without "roleList" raises KeyError
+            if isinstance(_role,list):  # list-form role; assumes [role_name, role_text, money, ...] - TODO confirm
+                extract_count += 1  # +1 for every role entry
+                if _role[2]!='' and float(_role[2])>0:
+                    extract_count += 1  # +1 when the role carries a positive amount
+                if _role[0]=="tenderee":
+                    tenderee = _role[1]  # assigned but not used afterwards in this function
+                if _role[0]=="win_tenderer":
+                    if  win_tenderer=="":
+                        win_tenderer = _role[1]
+                    if _role[2]!='' and float(_role[2])>0:
+                        extract_count += 1  # NOTE(review): same amount was already counted above, so a winner's money adds 2 - confirm intended
+                        if win_bid_price=="":
+                            win_bid_price = str(float(_role[2]))
+                if _role[0]=="agency":
+                    agency = _role[1]  # assigned but not used afterwards in this function
+            if isinstance(_role,dict):  # dict-form role read via "role_name" / "role_text" / optional "role_money"
+                extract_count += 1  # +1 for every role entry
+                if "role_money" in _role:
+                    if str(_role["role_money"].get("money",""))!='' and float(_role["role_money"].get("money",""))>0:
+                        extract_count += 1  # +1 when the role carries a positive amount
+                if _role.get("role_name")=="tenderee":
+                    tenderee = _role["role_text"]  # assigned but not used afterwards in this function
+                if _role.get("role_name")=="win_tenderer":
+                    if  win_tenderer=="":
+                        win_tenderer = _role["role_text"]
+                    if "role_money" in _role:
+                        if str(_role["role_money"]["money"])!='' and float(_role["role_money"]["money"])>0:
+                            extract_count += 1  # NOTE(review): same amount was already counted above, so a winner's money adds 2 - confirm intended
+                            if win_bid_price=="":
+                                win_bid_price = str(float(_role["role_money"]["money"]))
+                if _role["role_name"]=="agency":  # direct index, unlike the .get() used above: KeyError if "role_name" is missing
+                    agency = _role["role_text"]  # assigned but not used afterwards in this function
+
+    if project_code!="":
+        extract_count += 1  # +1 for a non-empty project code
+    if project_name!="":
+        extract_count += 1  # +1 for a non-empty project name
+    return extract_count
+
 def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     cost_time = dict()
 
@@ -169,6 +233,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',**kwargs):
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
+    # 要素的个数
+    data_res['extract_count'] = extractCount(data_res)
+    # 是否有表格
+    data_res['exist_table'] = 1 if re.search("<td",text) else 0
     data_res["cost_time"] = cost_time
     data_res["success"] = True
 

+ 0 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -1026,7 +1026,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
         last_tokens_num = len(sentence.tokens)
     attribute_type = ['money','serviceTime','ratio']# 'money'仅指“中投标金额”
-    # print([i.entity_text for i in list_entity if i.entity_type=='money'])
     for link_attribute in attribute_type:
         temp_entity_list = []
         if link_attribute=="money":
@@ -1045,7 +1044,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 drop_tendererMoney.append(next_entity)
             for _drop in drop_tendererMoney:
                 temp_entity_list.remove(_drop)
-            # print([i.entity_text for i in temp_entity_list])
         elif link_attribute=="serviceTime":
             temp_entity_list = [ent for ent in list_entity if (ent.entity_type in ['org','company'] and ent.label in [2,3,4]) or
                                 ent.entity_type=='serviceTime']
@@ -1104,7 +1102,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # km算法分配求解
         dispatch_result = dispatch(temp_match_list)
         dispatch_result = sorted(dispatch_result, key=lambda x: (x[0].sentence_index,x[0].begin_index))
-        # print(dispatch_result)
         for match in dispatch_result:
             _entity = match[0]
             _attribute = match[1]