Browse Source

产品属性新增提取类型(文本内容预测)

znj 1 year ago
parent
commit
32918dd441
1 changed file with 112 additions and 16 deletions
  1. 112 16
      BiddingKG/dl/interface/predictor.py

+ 112 - 16
BiddingKG/dl/interface/predictor.py

@@ -3303,6 +3303,7 @@ class ProductAttributesPredictor():
                     header_col = []
                     product_link = []
                     demand_link = []
+                    product_set = set()
                     for idx in range(len(begin_list)):
                         if idx==len(begin_list)-1:
                             deal_list = head_value_list[begin_list[idx]:]
@@ -3321,6 +3322,8 @@ class ProductAttributesPredictor():
                         order_time = ""  # 采购时间
                         order_begin = ""
                         order_end = ""
+                        total_price = ""  # 总金额
+                        parameter = ""  # 参数
 
                         header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
                         if found_header:
@@ -3328,6 +3331,7 @@ class ProductAttributesPredictor():
                             headers_demand.append('_'.join(header_list2))
                             header_col.append('_'.join(tmp_head_list))
                             # print('header_dic: ',header_dic)
+                            id0 = header_dic.get('品目', "")
                             id1 = header_dic.get('名称', "")
                             id2 = header_dic.get('数量', "")
                             id2_2 = header_dic.get('单位', "")
@@ -3338,9 +3342,18 @@ class ProductAttributesPredictor():
                             id6 = header_dic.get('需求', "")
                             id7 = header_dic.get('预算', "")
                             id8 = header_dic.get('时间', "")
+
+                            id9 = header_dic.get("总价", "")
+                            id10 = header_dic.get('参数', "")
                             if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
                                     re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
                                 product = deal_list[id1]
+                            if id0 != "" and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id0]) and deal_list[id0] not in self.header_set and \
+                                    re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id0]) == None:
+                                category = deal_list[id0]
+                                product = "%s_%s" % (category, product) if product != "" else category
+
+                            if product != "":
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                         quantity = deal_list[id2]
@@ -3381,11 +3394,15 @@ class ProductAttributesPredictor():
                                 if id4 != "":
                                     if re.search('\w', deal_list[id4]):
                                         brand = deal_list[id4]
+                                        if re.match('^详见|^略$', brand.strip()):
+                                            brand = ""
                                     else:
                                         brand = ""
                                 if id5 != "":
                                     if re.search('\w', deal_list[id5]):
-                                        specs = deal_list[id5]
+                                        specs = deal_list[id5][:500]
+                                        if re.match('^详见|^略$', specs.strip()):
+                                            brand = ""
                                     else:
                                         specs = ""
                                 if id6 != "":
@@ -3408,25 +3425,104 @@ class ProductAttributesPredictor():
                                     if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
                                         order_time = deal_list[id8].strip()
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
-                                # print(quantity,unitPrice,brand,specs)
-                                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                                    link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
-                                            'brand': brand[:50], 'specs': specs}
-                                    if link not in product_link:
-                                        product_link.append(link)
-                                        # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
-                                        # if link['unitPrice'] != "" and mat:
-                                        #     try:
-                                        #         total_product_money += float(link['unitPrice']) * float(
-                                        #             mat.group(1).replace(',', ''))
-                                        #     except:
-                                        #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
-                                        #         link['unitPrice'], link['quantity']))
+                                if id9 != "":
+                                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id9]):
+                                        total_price = deal_list[id9]
+                                    elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', deal_list[id9].strip()):
+                                        total_price = deal_list[id9]
+                                if id10 != "":
+                                    parameter = deal_list[id10][:500]
+                                    if re.match('^详见|^略$', parameter.strip()):
+                                        parameter = ""
+                                if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
+                                    if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
+                                            re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[
+                                        id2])):  # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
+                                        products = re.split('[;;、,\n]', deal_list[id1])
+                                        quantitys = re.split('[;;、,\n]', deal_list[id2])
+                                        unitPrices = re.split('[;;、,\n]', deal_list[id3])
+                                        total_prices = re.split('[;;、,\n]', total_price)
+                                        brands = re.split('[;;、,\n]', brand) if re.search('等$', brand) == None else [brand]
+                                        specses = re.split('[;;、,\n]', specs) if re.search('等$', specs) == None else [specs]
+                                        parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter) == None else [parameter]
+                                        unitPrices = [""] * len(products) if len(unitPrices) == 1 else unitPrices
+                                        total_prices = [""] * len(products) if len(total_prices) == 1 else total_prices
+                                        brands = brands * len(products) if len(brands) == 1 else brands
+                                        specses = specses * len(products) if len(specses) == 1 else specses
+                                        parameters = parameters * len(products) if len(parameters) == 1 else parameters
+                                        if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(
+                                                specses):
+                                            for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(
+                                                    products, quantitys, unitPrices, brands, specses, total_prices,
+                                                    parameters):
+                                                # if quantity != "":
+                                                #     quantity, quantity_unit_ = self.fix_quantity(quantity,
+                                                #                                                  header_quan_unit)
+                                                #     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                                if unitPrice != "":
+                                                    unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                                if budget != "":
+                                                    budget, _money_unit = money_process(budget, header_list2[2])
+                                                    budget = str(budget) if budget != 0 else ''
+                                                if total_price != "":
+                                                    total_price, _money_unit = money_process(total_price,
+                                                                                             header_list[6])
+                                                    total_price = str(total_price) if unitPrice != 0 else ""
+                                                link = {'product': product, 'quantity': quantity,
+                                                        'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
+                                                        'brand': brand[:50], 'specs': specs, 'total_price': total_price,
+                                                        'parameter': parameter}
+
+                                                if (product, specs, unitPrice, quantity) not in product_set:
+                                                    product_set.add((product, specs, unitPrice, quantity))
+                                                    product_link.append(link)
+                                                    # if link['unitPrice'] != "" and link['quantity'] != '':
+                                                    #     try:
+                                                    #         total_product_money += float(link['unitPrice']) * float(
+                                                    #             link['quantity']) if float(
+                                                    #             link['quantity']) < 50000 else 0
+                                                    #     except:
+                                                    #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                                    #             link['unitPrice'], link['quantity']))
+
+                                    elif len(unitPrice) > 15 or len(product) > 100:  # 单价大于15位数或 产品名称长于100字
+                                        # i += 1
+                                        continue
+                                    else:
+                                        # if quantity != "":
+                                        #     quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
+                                        #     quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                        if unitPrice != "":
+                                            unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                        if budget != "":
+                                            budget, _money_unit = money_process(budget, header_list2[2])
+                                            budget = str(budget) if budget != 0 else ''
+                                        if total_price != "":
+                                            total_price, _money_unit = money_process(total_price, header_list[6])
+                                            total_price = str(total_price) if unitPrice != 0 else ""
+                                        link = {'product': product, 'quantity': quantity,
+                                                'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
+                                                'brand': brand[:50], 'specs': specs, 'total_price': total_price,
+                                                'parameter': parameter}
+
+                                        if (product, specs, unitPrice, quantity) not in product_set:
+                                            product_set.add((product, specs, unitPrice, quantity))
+                                            product_link.append(link)
+                                            # if link['unitPrice'] != "" and link['quantity'] != '':
+                                            #     try:
+                                            #         total_product_money += float(link['unitPrice']) * float(
+                                            #             link['quantity']) if float(link['quantity']) < 50000 else 0
+                                            #     except:
+                                            #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                            #         link['unitPrice'], link['quantity']))
+
                                 if order_begin != "" and order_end != "":
                                     order_begin_year = int(order_begin.split("-")[0])
                                     order_end_year = int(order_end.split("-")[0])
                                     # 限制附件错误识别时间
-                                    if order_begin_year >= 2050 or order_end_year >= 2050:
+                                    if order_begin_year >= 2050 or order_begin_year < 2000 or order_end_year >= 2050 or order_end_year < 2000:
                                         order_begin = order_end = ""
                                 # print(budget, order_time)
                                 if budget != "" and order_time != "":