Przeglądaj źródła

Merge remote-tracking branch 'origin/master'

lsm 1 rok temu
rodzic
commit
e79f801017

+ 23 - 14
BiddingKG/dl/interface/getAttributes.py

@@ -1541,6 +1541,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
             _subject = relation[0]
             _object = relation[2]
             if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
+                if _subject.in_attachment != _object.in_attachment:
+                    continue
                 if relation[1]==predicate:
                     distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
                             tokens_num_dict[_subject.sentence_index] + _subject.end_index)
@@ -1962,6 +1964,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         match_nums = 0
                         for after_index in range(index + 1, min(len(split_entitys), index + 4)):
                             after_entity = split_entitys[after_index]
+                            if entity.in_attachment != after_entity.in_attachment:
+                                break
                             if after_entity.entity_type in ['person']:
                                 distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
                                                    tokens_num_dict[entity.sentence_index] + entity.end_index)
@@ -2051,21 +2055,24 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                             new_split_list[split_index][1]:
                                         mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace(",", "")
                                         if re.search(key_phone, mid_sentence):
-                                            distance = 1
-                                            if is_same_sentence:
-                                                if phone_begin <= 200:
-                                                    if entity.label in [2,3,4] and phone_begin>80:
-                                                        break
-                                                    value = (-1 / 2 * (distance ** 2)) / 10000
-                                                    match_list2.append(Match(entity, (entity, _phone), value))
-                                                    match_nums += 1
+                                            if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",mid_sentence[-8:]):
+                                                pass
                                             else:
-                                                if phone_begin <= 60:
-                                                    if entity.label in [2,3,4] and phone_begin>40:
-                                                        break
-                                                    value = (-1 / 2 * (distance ** 2)) / 10000
-                                                    match_list2.append(Match(entity, (entity, _phone), value))
-                                                    match_nums += 1
+                                                distance = 1
+                                                if is_same_sentence:
+                                                    if phone_begin <= 200:
+                                                        if entity.label in [2,3,4] and phone_begin>80:
+                                                            break
+                                                        value = (-1 / 2 * (distance ** 2)) / 10000
+                                                        match_list2.append(Match(entity, (entity, _phone), value))
+                                                        match_nums += 1
+                                                else:
+                                                    if phone_begin <= 60:
+                                                        if entity.label in [2,3,4] and phone_begin>40:
+                                                            break
+                                                        value = (-1 / 2 * (distance ** 2)) / 10000
+                                                        match_list2.append(Match(entity, (entity, _phone), value))
+                                                        match_nums += 1
                         else:
                             next_entity = split_entitys[index + 1]
                             if next_entity.entity_type in ["org","company"]:
@@ -2101,6 +2108,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                         p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
                                         if next_entity.entity_type == 'person' and _phone in p_phone:
                                             pass
+                                        elif entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", mid_sentence[-8:]):
+                                            pass
                                         else:
                                             distance = (tokens_num_dict[
                                                             next_entity.sentence_index] + next_entity.begin_index) - (

+ 111 - 18
BiddingKG/dl/interface/predictor.py

@@ -3322,6 +3322,7 @@ class ProductAttributesPredictor():
                     header_col = []
                     product_link = []
                     demand_link = []
+                    product_set = set()
                     for idx in range(len(begin_list)):
                         if idx==len(begin_list)-1:
                             deal_list = head_value_list[begin_list[idx]:]
@@ -3340,6 +3341,8 @@ class ProductAttributesPredictor():
                         order_time = ""  # 采购时间
                         order_begin = ""
                         order_end = ""
+                        total_price = ""  # 总金额
+                        parameter = ""  # 参数
 
                         header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
                         if found_header:
@@ -3347,6 +3350,7 @@ class ProductAttributesPredictor():
                             headers_demand.append('_'.join(header_list2))
                             header_col.append('_'.join(tmp_head_list))
                             # print('header_dic: ',header_dic)
+                            id0 = header_dic.get('品目', "")
                             id1 = header_dic.get('名称', "")
                             id2 = header_dic.get('数量', "")
                             id2_2 = header_dic.get('单位', "")
@@ -3357,9 +3361,18 @@ class ProductAttributesPredictor():
                             id6 = header_dic.get('需求', "")
                             id7 = header_dic.get('预算', "")
                             id8 = header_dic.get('时间', "")
+
+                            id9 = header_dic.get("总价", "")
+                            id10 = header_dic.get('参数', "")
                             if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
                                     re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
                                 product = deal_list[id1]
+                            if id0 != "" and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id0]) and deal_list[id0] not in self.header_set and \
+                                    re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id0]) == None:
+                                category = deal_list[id0]
+                                product = "%s_%s" % (category, product) if product != "" else category
+
+                            if product != "":
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                         quantity = deal_list[id2]
@@ -3400,11 +3413,15 @@ class ProductAttributesPredictor():
                                 if id4 != "":
                                     if re.search('\w', deal_list[id4]):
                                         brand = deal_list[id4]
+                                        if re.match('^详见|^略$', brand.strip()):
+                                            brand = ""
                                     else:
                                         brand = ""
                                 if id5 != "":
                                     if re.search('\w', deal_list[id5]):
-                                        specs = deal_list[id5]
+                                        specs = deal_list[id5][:500]
+                                        if re.match('^详见|^略$', specs.strip()):
+                                            brand = ""
                                     else:
                                         specs = ""
                                 if id6 != "":
@@ -3424,28 +3441,105 @@ class ProductAttributesPredictor():
                                             if float(budget)>= 100000*10000:
                                                 budget = ""
                                 if id8 != "":
-                                    if re.search('\w', deal_list[id8]):
+                                    if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
                                         order_time = deal_list[id8].strip()
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
-                                # print(quantity,unitPrice,brand,specs)
-                                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                                    link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
-                                            'brand': brand[:50], 'specs': specs}
-                                    if link not in product_link:
-                                        product_link.append(link)
-                                        # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
-                                        # if link['unitPrice'] != "" and mat:
-                                        #     try:
-                                        #         total_product_money += float(link['unitPrice']) * float(
-                                        #             mat.group(1).replace(',', ''))
-                                        #     except:
-                                        #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
-                                        #         link['unitPrice'], link['quantity']))
+                                if id9 != "":
+                                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id9]):
+                                        total_price = deal_list[id9]
+                                    elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', deal_list[id9].strip()):
+                                        total_price = deal_list[id9]
+                                if id10 != "":
+                                    parameter = deal_list[id10][:500]
+                                    if re.match('^详见|^略$', parameter.strip()):
+                                        parameter = ""
+                                if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
+                                    if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
+                                            re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[id2])):  # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
+                                        products = re.split('[;;、,\n]', deal_list[id1])
+                                        quantitys = re.split('[;;、,\n]', deal_list[id2])
+                                        unitPrices = re.split('[;;、,\n]', deal_list[id3])
+                                        total_prices = re.split('[;;、,\n]', total_price)
+                                        brands = re.split('[;;、,\n]', brand) if re.search('等$', brand) == None else [brand]
+                                        specses = re.split('[;;、,\n]', specs) if re.search('等$', specs) == None else [specs]
+                                        parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter) == None else [parameter]
+                                        unitPrices = [""] * len(products) if len(unitPrices) == 1 else unitPrices
+                                        total_prices = [""] * len(products) if len(total_prices) == 1 else total_prices
+                                        brands = brands * len(products) if len(brands) == 1 else brands
+                                        specses = specses * len(products) if len(specses) == 1 else specses
+                                        parameters = parameters * len(products) if len(parameters) == 1 else parameters
+                                        if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(
+                                                specses):
+                                            for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(
+                                                    products, quantitys, unitPrices, brands, specses, total_prices,
+                                                    parameters):
+                                                if quantity != "":
+                                                    quantity, quantity_unit_ = self.fix_quantity(quantity,quantity_unit)
+                                                    quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                                if unitPrice != "":
+                                                    unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                                    unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                                if budget != "":
+                                                    budget, _money_unit = money_process(budget, header_list2[2])
+                                                    budget = str(budget) if budget != 0 else ''
+                                                if total_price != "":
+                                                    total_price, _money_unit = money_process(total_price,
+                                                                                             header_list[6])
+                                                    total_price = str(total_price) if unitPrice != 0 else ""
+                                                link = {'product': product, 'quantity': quantity,
+                                                        'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
+                                                        'brand': brand[:50], 'specs': specs, 'total_price': total_price,
+                                                        'parameter': parameter}
+
+                                                if (product, specs, unitPrice, quantity) not in product_set:
+                                                    product_set.add((product, specs, unitPrice, quantity))
+                                                    product_link.append(link)
+                                                    # if link['unitPrice'] != "" and link['quantity'] != '':
+                                                    #     try:
+                                                    #         total_product_money += float(link['unitPrice']) * float(
+                                                    #             link['quantity']) if float(
+                                                    #             link['quantity']) < 50000 else 0
+                                                    #     except:
+                                                    #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                                    #             link['unitPrice'], link['quantity']))
+
+                                    elif len(unitPrice) > 15 or len(product) > 100:  # 单价大于15位数或 产品名称长于100字
+                                        # i += 1
+                                        continue
+                                    else:
+                                        if quantity != "":
+                                            quantity, quantity_unit_ = self.fix_quantity(quantity, quantity_unit)
+                                            quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
+                                        if unitPrice != "":
+                                            unitPrice, _money_unit = money_process(unitPrice, header_list[3])
+                                            unitPrice = str(unitPrice) if unitPrice != 0 else ""
+                                        if budget != "":
+                                            budget, _money_unit = money_process(budget, header_list2[2])
+                                            budget = str(budget) if budget != 0 else ''
+                                        if total_price != "":
+                                            total_price, _money_unit = money_process(total_price, header_list[6])
+                                            total_price = str(total_price) if unitPrice != 0 else ""
+                                        link = {'product': product, 'quantity': quantity,
+                                                'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
+                                                'brand': brand[:50], 'specs': specs, 'total_price': total_price,
+                                                'parameter': parameter}
+
+                                        if (product, specs, unitPrice, quantity) not in product_set:
+                                            product_set.add((product, specs, unitPrice, quantity))
+                                            product_link.append(link)
+                                            # if link['unitPrice'] != "" and link['quantity'] != '':
+                                            #     try:
+                                            #         total_product_money += float(link['unitPrice']) * float(
+                                            #             link['quantity']) if float(link['quantity']) < 50000 else 0
+                                            #     except:
+                                            #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
+                                            #         link['unitPrice'], link['quantity']))
+
                                 if order_begin != "" and order_end != "":
                                     order_begin_year = int(order_begin.split("-")[0])
                                     order_end_year = int(order_end.split("-")[0])
                                     # 限制附件错误识别时间
-                                    if order_begin_year >= 2050 or order_end_year >= 2050:
+                                    if order_begin_year >= 2050 or order_begin_year < 2000 or order_end_year >= 2050 or order_end_year < 2000:
                                         order_begin = order_end = ""
                                 # print(budget, order_time)
                                 if budget != "" and order_time != "":
@@ -3463,7 +3557,6 @@ class ProductAttributesPredictor():
                         demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
                     else:
                         demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
-
                     product_attrs[0] = attr_dic
                     if len(product_attrs[1]['demand_info']['data']) == 0:
                         product_attrs[1] = demand_dic