Sfoglia il codice sorgente

产品属性增加总价及参数、优化提取

lsm 1 anno fa
parent
commit
97128ea86d
1 file modificato con 148 aggiunte e 81 eliminazioni
  1. 148 81
      BiddingKG/dl/interface/predictor.py

+ 148 - 81
BiddingKG/dl/interface/predictor.py

@@ -2536,8 +2536,11 @@ class ProductPredictor():
 # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
 class ProductAttributesPredictor():
     def __init__(self,):
-        self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
-        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
+        self.p0 = '(品目|类别|类型|物类|目录|^品名|^品类)(名称|$)'
+        self.p1 = '(标的|标项|项目|计划|标段|[分子]?包|子目|维修|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|服务|工程|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|招标|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名)[\))的]?(名称|内容|描述)'
+        self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
+        # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
+        # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
             self.header_set = pickle.load(f)
     def isTrueTable(self, table):
@@ -2580,7 +2583,8 @@ class ProductAttributesPredictor():
             if len(tds) < 2:
                 continue
             for td in tds:
-                td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
+                # td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
+                td_text = re.sub('…', '', td.get_text()).strip()
                 td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
                 td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':')
                 tr_line.append(td_text)
@@ -2806,7 +2810,7 @@ class ProductAttributesPredictor():
             quantity_unit = ""
         return quantity, quantity_unit
 
-    def find_header(self, items, p1, p2):
+    def find_header(self, items,p0, p1, p2):
         '''
         inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
         :param items: 列表,内容为每个td 文本内容
@@ -2815,7 +2819,7 @@ class ProductAttributesPredictor():
         :return: 表头所在列序号,是否表头,表头内容
         '''
         flag = False
-        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
         product = ""  # 产品
         quantity = ""  # 数量
         quantity_unit = "" # 数量单位
@@ -2825,25 +2829,36 @@ class ProductAttributesPredictor():
         demand = "" # 采购需求
         budget = "" # 预算金额
         order_time = "" # 采购时间
+        total_price = "" # 总价
+        category = "" # 品目
+        parameter = "" # 参数
 
-        for i in range(min(4, len(items))):
+        for i in range(min(6, len(items))):
             it = items[i]
-            if len(it) < 15 and re.search(p1, it) != None:
+            if len(it) < 15 and re.search(p0, it) != None:
+                flag = True
+                category = it
+                header_dic['品目'] = i
+            elif len(it) < 15 and re.search(p1, it) != None:
                 flag = True
                 product = it
                 header_dic['名称'] = i
                 break
-        if not flag:
+        # if not flag:
+        if product == "":
             for i in range(min(4, len(items))):
                 it = items[i]
-                if len(it) < 15 and re.search(p2, it) and re.search(
+                if len(it) < 15 and it != category and re.search(p2, it) and re.search(
                         '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
                     flag = True
                     product = it
                     header_dic['名称'] = i
                     break
         if flag:
-            for j in range(i + 1, len(items)):
+            # for j in range(i + 1, len(items)):
+            for j in range(len(items)):
+                if items[j] in [product, category]:
+                    continue
                 if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                     continue
                 if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
@@ -2861,6 +2876,9 @@ class ProductAttributesPredictor():
                 elif re.search('规格|型号', items[j]):
                     header_dic['规格'] = j
                     specs = items[j]
+                elif re.search('参数', items[j]):
+                    header_dic['参数'] = j
+                    parameter = items[j]
 
                 elif re.search('需求|服务要求|服务标准', items[j]):
                     header_dic['需求'] = j
@@ -2871,16 +2889,21 @@ class ProductAttributesPredictor():
                 elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                     header_dic['时间'] = j
                     order_time = items[j]
-
-            if header_dic.get('名称', "") != "" :
-                num = 0
-                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
-                    if it != "":
-                        num  += 1
-                if num >=2:
-                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
+                elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?(金额|价格?)', items[j]):
+                    header_dic['总价'] = j
+                    total_price = items[j]
+
+            if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
+                # num = 0
+                # for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time, total_price):
+                #     if it != "":
+                #         num  += 1
+                # if num >=2:
+                #     return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
+                if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
         flag = False
-        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
 
     def predict(self, docid='', html='', page_time=""):
         '''
@@ -2899,6 +2922,7 @@ class ProductAttributesPredictor():
         header_col = []
         product_link = []
         demand_link = []
+        product_set = set()
         total_product_money = 0
         for i in range(len(tables)-1, -1, -1):
             table = tables[i]
@@ -2983,10 +3007,11 @@ class ProductAttributesPredictor():
                 order_time = ""  # 采购时间
                 order_begin = ""
                 order_end = ""
-                # print(tds,set(tds) & self.header_set)
+                total_price = "" # 总金额
+                parameter = "" # 参数
                 if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
-                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
+                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
                     if found_header:
                         header_colnum = len(tds) # 保存表头所在行列数
                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
@@ -3006,6 +3031,7 @@ class ProductAttributesPredictor():
                     if len(tds) != header_colnum:  # 表头、属性列数不一致跳过
                         i += 1
                         continue
+                    id0 = header_dic.get('品目', "")
                     id1 = header_dic.get('名称', "")
                     id2 = header_dic.get('数量', "")
                     id2_2 = header_dic.get('单位', "")
@@ -3017,6 +3043,9 @@ class ProductAttributesPredictor():
                     id7 = header_dic.get('预算', "")
                     id8 = header_dic.get('时间', "")
 
+                    id9 = header_dic.get("总价", "")
+                    id10 = header_dic.get('参数', "")
+
                     not_attr = 0
                     for k, v in header_dic.items():
                         if isinstance(v, int):
@@ -3028,48 +3057,39 @@ class ProductAttributesPredictor():
                         found_header = False
                         continue
 
-                    if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
+                    if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                         product = tds[id1]
+
+                    if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
+                            re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
+                        category = tds[id0]
+                        product = "%s_%s"%(category, product) if product!="" else category
+
+                    if product != "":
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                                 quantity = tds[id2]
-                                # quantity = re.sub('[()(),,约]', '', quantity)
-                                # quantity = re.sub('[一壹]', '1', quantity)
-                                # ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
-                                # if ser:
-                                #     quantity = str(ser.group(1))
-                                #     quantity_unit = ser.group(2)
-                                #     if quantity_unit == "" and header_quan_unit != "":
-                                #         quantity_unit = header_quan_unit
-                                # else:
-                                #     quantity = ""
-                                #     quantity_unit = ""
                         if id2_2 != "":
-                            if re.search('^\w{1,4}$', tds[id2_2]):
+                            if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
                                 quantity_unit = tds[id2_2]
                         if id3 != "":
                             if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 unitPrice = tds[id3]
-                            elif re.search('^[\d,.亿万元人民币欧美日金额:()()]+$', tds[id3].strip()):
+                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
                                 unitPrice = tds[id3]
-                                # _unitPrice = tds[id3]
-                                # re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
-                                # if re_price:
-                                #     _unitPrice = re_price[0]
-                                #     if '万元' in header_list[2] and '万' not in _unitPrice:
-                                #         _unitPrice += '万元'
-                                #     # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
-                                #     unitPrice = str(getUnifyMoney(_unitPrice))
-
                         if id4 != "":
                             if re.search('\w', tds[id4]):
                                 brand = tds[id4]
+                                if re.match('^详见|^略$', brand.strip()):
+                                    brand = ""
                             else:
                                 brand = ""
                         if id5 != "":
                             if re.search('\w', tds[id5]):
                                 specs = tds[id5][:500] # 限制最多500字
+                                if re.match('^详见|^略$', specs.strip()):
+                                    specs = ""
                             else:
                                 specs = ""
                         if id6 != "":
@@ -3080,46 +3100,69 @@ class ProductAttributesPredictor():
                         if id7 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                                 budget = tds[id7]
-                                # _budget = tds[id7]
-                                # re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
-                                # if re_price:
-                                #     _budget = re_price[0]
-                                #     if '万元' in header_list2[2] and '万' not in _budget:
-                                #         _budget += '万元'
-                                #     budget = str(getUnifyMoney(_budget))
                         if id8 != "":
                             if re.search('\w', tds[id8]):
                                 order_time = tds[id8].strip()
                                 order_begin, order_end = self.fix_time(order_time, html, page_time)
-                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                            if quantity != "":
-                                quantity, quantity_unit = self.fix_quantity(quantity, header_quan_unit)
-                            if unitPrice != "":
-                                unitPrice, _money_unit = money_process(unitPrice, header_list[2])
-                                unitPrice = str(unitPrice) if unitPrice != 0 else ""
-                            if budget != "":
-                                budget, _money_unit = money_process(budget, header_list2[2])
-                                budget = str(budget) if budget != 0 else ''
-
-                            if id2 != "" and id3 != "" and len(re.split('[,,\s]', tds[id2])) > 1 and len(re.split('[,,\s]', tds[id1])) == len(re.split('[,,\s]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
-                                products = re.split('[,,\s]', tds[id1])
-                                quantitys = re.split('[,,\s]', tds[id2])
-                                unitPrices = re.split('[,,\s]', tds[id3])
-                                brands = re.split('[,,\s]', brand)
-                                specses = re.split('[,,\s]', specs)
+                        if id9 != "":
+                            if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
+                                total_price = tds[id9]
+                            elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
+                                total_price = tds[id9]
+                        if id10 != "":
+                            parameter = tds[id10][:500]
+                            if re.match('^详见|^略$', parameter.strip()):
+                                parameter = ""
+                        if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
+                            if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
+                                products = re.split('[;;、,\n]', tds[id1])
+                                quantitys = re.split('[;;、,\n]', tds[id2])
+                                unitPrices = re.split('[;;、,\n]', tds[id3])
+                                total_prices = re.split('[;;、,\n]', total_price)
+                                brands = re.split('[;;、,\n]', brand) if re.search('等$', brand)==None else [brand]
+                                specses = re.split('[;;、,\n]', specs) if re.search('等$', specs)==None else [specs]
+                                parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter)==None else [parameter]
+                                unitPrices = [""]*len(products) if len(unitPrices)==1 else unitPrices
+                                total_prices = [""]*len(products) if len(total_prices)==1 else total_prices
+                                brands = brands*len(products) if len(brands)==1 else brands
+                                specses = specses*len(products) if len(specses)==1 else specses
+                                parameters = parameters*len(products) if len(parameters)==1 else parameters
                                 if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
-                                    for product, quantity, unitPrice, brand, specs in zip(products,quantitys,unitPrices, brands, specses):
+                                    for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
+                                        if quantity != "":
+                                            quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
+                                            quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                        if unitPrice != "":
+                                            unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                        if budget != "":
+                                            budget, _money_unit = money_process(budget, header_list2[2])
+                                            budget = str(budget) if budget != 0 else ''
+                                        if total_price != "":
+                                            total_price, _money_unit = money_process(total_price, header_list[6])
+                                            total_price = str(total_price) if unitPrice != 0 else ""
                                         link = {'product': product, 'quantity': quantity,
                                                 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
-                                                'brand': brand[:50], 'specs': specs}
-                                        if link not in product_link:
+                                                'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
+                                        # if link not in product_link:
+                                        #     product_link.append(link)
+                                        #     mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
+                                        #     if link['unitPrice'] != "" and mat:
+                                        #         try:
+                                        #             total_product_money += float(link['unitPrice']) * float(
+                                        #                 mat.group(1).replace(',', '')) if float(
+                                        #                 mat.group(1).replace(',', '')) < 50000 else 0
+                                        #         except:
+                                        #             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                        #             link['unitPrice'], link['quantity']))
+
+                                        if (product, specs, unitPrice, quantity) not in product_set:
+                                            product_set.add((product, specs, unitPrice, quantity))
                                             product_link.append(link)
-                                            mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
-                                            if link['unitPrice'] != "" and mat:
+                                            if link['unitPrice'] != "" and link['quantity'] != '':
                                                 try:
                                                     total_product_money += float(link['unitPrice']) * float(
-                                                        mat.group(1).replace(',', '')) if float(
-                                                        mat.group(1).replace(',', '')) < 50000 else 0
+                                                        link['quantity']) if float(link['quantity']) < 50000 else 0
                                                 except:
                                                     log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
                                                     link['unitPrice'], link['quantity']))
@@ -3128,16 +3171,40 @@ class ProductAttributesPredictor():
                                 i += 1
                                 continue
                             else:
+                                if quantity != "":
+                                    quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
+                                    quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                if unitPrice != "":
+                                    unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                if budget != "":
+                                    budget, _money_unit = money_process(budget, header_list2[2])
+                                    budget = str(budget) if budget != 0 else ''
+                                if total_price != "":
+                                    total_price, _money_unit = money_process(total_price, header_list[6])
+                                    total_price = str(total_price) if unitPrice != 0 else ""
                                 link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
-                                                          'brand': brand[:50], 'specs':specs}
-                                if link not in product_link:
+                                                          'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
+
+                                # if link not in product_link:
+                                #     product_link.append(link)
+                                #     mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
+                                #     if link['unitPrice'] != "" and mat:
+                                #         try:
+                                #             total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
+                                #         except:
+                                #             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
+
+                                if (product, specs, unitPrice, quantity) not in product_set:
+                                    product_set.add((product, specs, unitPrice, quantity))
                                     product_link.append(link)
-                                    mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
-                                    if link['unitPrice'] != "" and mat:
+                                    if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
-                                            total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
+                                            total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
                                         except:
                                             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
+
+
                         if order_begin != "" and order_end != "":
                             order_begin_year = int(order_begin.split("-")[0])
                             order_end_year = int(order_end.split("-")[0])
@@ -3255,7 +3322,7 @@ class ProductAttributesPredictor():
                         order_begin = ""
                         order_end = ""
 
-                        header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p1,self.p2)
+                        header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
                         if found_header:
                             headers.append('_'.join(header_list))
                             headers_demand.append('_'.join(header_list2))
@@ -3271,7 +3338,7 @@ class ProductAttributesPredictor():
                             id6 = header_dic.get('需求', "")
                             id7 = header_dic.get('预算', "")
                             id8 = header_dic.get('时间', "")
-                            if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
+                            if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
                                     re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
                                 product = deal_list[id1]
                                 if id2 != "":