|
@@ -2567,9 +2567,9 @@ class ProductAttributesPredictor():
|
|
|
col1_l = []
|
|
|
for tds in inner_table:
|
|
|
if len(tds) == 2:
|
|
|
- col0_l.append(re.sub('[::]', '', tds[0]))
|
|
|
+ col0_l.append(re.sub('[::]', '', tds[0])) # row with exactly two cells: cell 0 (colons stripped) goes to col0_l, cell 1 to col1_l
|
|
|
col1_l.append(tds[1])
|
|
|
- elif len(tds)>=4 and len(inner_table)==2:
|
|
|
+ elif len(tds)>=4 and len(inner_table)==2: # table with exactly two rows (>= 4 cells in this row): row 0 becomes col0_l, row 1 becomes col1_l
|
|
|
col0_l = inner_table[0]
|
|
|
col1_l = inner_table[1]
|
|
|
break
|
|
@@ -2616,7 +2616,7 @@ class ProductAttributesPredictor():
|
|
|
while i < (len(inner_table)):
|
|
|
tds = inner_table[i]
|
|
|
not_empty = [it for it in tds if it != ""]
|
|
|
- if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
|
|
|
+ if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # skip the row when fewer than half of its non-empty cells are distinct (mostly repeated values) or it has fewer than two cells
|
|
|
i += 1
|
|
|
continue
|
|
|
product = "" # product
|
|
@@ -2634,6 +2634,8 @@ class ProductAttributesPredictor():
|
|
|
if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2: # header candidate: distinct colon-stripped cells found in self.header_set exceed 20% of the cell count
|
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
|
|
|
+ if found_header:
|
|
|
+ header_colnum = len(tds) # remember the number of cells in the header row
|
|
|
if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # extract the quantity unit from the header
|
|
|
quantity_header = header_list[1].replace('单位:', '')
|
|
|
if re.search('(([\w/]{,5}))', quantity_header):
|
|
@@ -2644,7 +2646,6 @@ class ProductAttributesPredictor():
|
|
|
if found_header and len(headers)<1: # keep only the first header that appears
|
|
|
headers.append('_'.join(header_list))
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
|
- header_colnum = len(tds)
|
|
|
header_col.append('_'.join(tds))
|
|
|
i += 1
|
|
|
continue
|
|
@@ -2662,6 +2663,18 @@ class ProductAttributesPredictor():
|
|
|
id6 = header_dic.get('需求', "")
|
|
|
id7 = header_dic.get('预算', "")
|
|
|
id8 = header_dic.get('时间', "")
|
|
|
+
|
|
|
+ not_attr = 0 # set to 1 when this row cannot be read as an attribute row
|
|
|
+ for k, v in header_dic.items():
|
|
|
+ if isinstance(v, int): # only integer values are used as cell indexes into tds
|
|
|
+ if v >= len(tds) or tds[v] in self.header_set: # index past the end of this row, or the cell is itself a header word
|
|
|
+ not_attr = 1
|
|
|
+ break
|
|
|
+ if not_attr: # as soon as one attribute cell is a header, stop matching this row
|
|
|
+ i += 1
|
|
|
+ found_header = False
|
|
|
+ continue
|
|
|
+
|
|
|
if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
|
|
|
product = tds[id1]
|
|
@@ -4679,6 +4692,8 @@ class TablePremExtractor(object):
|
|
|
if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) - set(['', ' ']) == set(): # every field is empty: stop matching
|
|
|
break
|
|
|
|
|
|
+ if re.search('详见', project_name): # drop placeholder wording such as "详见招标文件" ("see the tender documents")
|
|
|
+ project_name = ""
|
|
|
if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
|
|
|
package_code_raw = project_name
|
|
|
project_name = ""
|