Prechádzať zdrojové kódy

修复某些公告产品属性报错

lsm 2 rokov pred
rodič
commit
9c0f857fa1

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -254,7 +254,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2022-12-08'}
+    version_date = {'version_date': '2022-12-13'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 19 - 4
BiddingKG/dl/interface/predictor.py

@@ -2567,9 +2567,9 @@ class ProductAttributesPredictor():
                 col1_l = []
                 for tds in inner_table:
                     if len(tds) == 2:
-                        col0_l.append(re.sub('[::]', '', tds[0]))
+                        col0_l.append(re.sub('[::]', '', tds[0]))  # 处理只有两列的情况
                         col1_l.append(tds[1])
-                    elif len(tds)>=4 and len(inner_table)==2:
+                    elif len(tds)>=4 and len(inner_table)==2:  # 处理只有两行的情况
                         col0_l = inner_table[0]
                         col1_l = inner_table[1]
                         break
@@ -2616,7 +2616,7 @@ class ProductAttributesPredictor():
             while i < (len(inner_table)):
                 tds = inner_table[i]
                 not_empty = [it for it in tds if it != ""]
-                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
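+                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # skip rows where fewer than half of the non-empty cells are distinct (mostly repeated values), or rows with fewer than two columns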
                     i += 1
                     continue
                 product = ""  # 产品
@@ -2634,6 +2634,8 @@ class ProductAttributesPredictor():
                 if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
+                    if found_header:
+                        header_colnum = len(tds) # 保存表头所在行列数
                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
                             quantity_header = header_list[1].replace('单位:', '')
                             if re.search('(([\w/]{,5}))', quantity_header):
@@ -2644,7 +2646,6 @@ class ProductAttributesPredictor():
                     if found_header and len(headers)<1:  # 只保留出现的第一个表头
                         headers.append('_'.join(header_list))
                         headers_demand.append('_'.join(header_list2))
-                        header_colnum = len(tds)
                         header_col.append('_'.join(tds))
                     i += 1
                     continue
@@ -2662,6 +2663,18 @@ class ProductAttributesPredictor():
                     id6 = header_dic.get('需求', "")
                     id7 = header_dic.get('预算', "")
                     id8 = header_dic.get('时间', "")
+
+                    not_attr = 0
+                    for k, v in header_dic.items():
+                        if isinstance(v, int):
+                            if v >= len(tds) or tds[v] in self.header_set:
+                                not_attr = 1
+                                break
+                    if not_attr: # 只要属性里面有一项为表头,停止匹配
+                        i += 1
+                        found_header = False
+                        continue
+
                     if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                         product = tds[id1]
@@ -4679,6 +4692,8 @@ class TablePremExtractor(object):
             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) - set(['', ' ']) == set():  # 全部为空 停止匹配
                 break
 
+            if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
+                project_name = ""
             if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
                 package_code_raw = project_name
                 project_name = ""