Ver código fonte

主要优化地区匹配;产品属性数量标准化,增加数量单位

lsm 2 anos atrás
pai
commit
167442dc4b
2 arquivos alterados com 95 adições e 23 exclusões
  1. 1 1
      BiddingKG/dl/interface/extract.py
  2. 94 22
      BiddingKG/dl/interface/predictor.py

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -245,7 +245,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''地区获取'''
     start_time = time.time()
-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name)
+    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
     cost_time["district"] = round(time.time() - start_time, 2)
 
     '''限制行业最高金额'''

+ 94 - 22
BiddingKG/dl/interface/predictor.py

@@ -683,6 +683,8 @@ class PREMPredict():
             text = text_list[i]
             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
                 label = 5
+            elif label in [2,3,4] and re.search('序号:\d+,', text):
+                label = 5
             elif label == 2:
                 if re.search('中标单位和.{,25}签订合同', text):
                     label = 0
@@ -1348,7 +1350,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -2248,6 +2250,7 @@ class ProductAttributesPredictor():
             for td in tds:
                 td_text = re.sub('\s', '', td.get_text())
                 td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
+                td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':')
                 tr_line.append(td_text)
             inner_table.append(tr_line)
         return inner_table
@@ -2459,9 +2462,10 @@ class ProductAttributesPredictor():
         :return: 表头所在列序号,是否表头,表头内容
         '''
         flag = False
-        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
         product = ""  # 产品
         quantity = ""  # 数量
+        quantity_unit = "" # 数量单位
         unitPrice = ""  # 单价
         brand = ""  # 品牌
         specs = ""  # 规格
@@ -2489,9 +2493,12 @@ class ProductAttributesPredictor():
             for j in range(i + 1, len(items)):
                 if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                     continue
-                if header_dic['数量']=="" and re.search('数量|采购量', items[j]):
+                if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
                     header_dic['数量'] = j
                     quantity = items[j]
+                elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]):
+                    header_dic['单位'] = j
+                    quantity_unit = items[j]
                 elif re.search('单价', items[j]):
                     header_dic['单价'] = j
                     unitPrice = items[j]
@@ -2518,9 +2525,9 @@ class ProductAttributesPredictor():
                     if it != "":
                         num  += 1
                 if num >=2:
-                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
         flag = False
-        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
 
     def predict(self, docid='', html='', page_time=""):
         '''
@@ -2553,6 +2560,7 @@ class ProductAttributesPredictor():
             # print(inner_table)
             i = 0
             found_header = False
+            header_quan_unit = ""  # 数量表头 包含单位
             header_colnum = 0
             if flag_yx:
                 col0_l = []
@@ -2613,6 +2621,7 @@ class ProductAttributesPredictor():
                     continue
                 product = ""  # 产品
                 quantity = ""  # 数量
+                quantity_unit = "" # 数量单位
                 unitPrice = ""  # 单价
                 brand = ""  # 品牌
                 specs = ""  # 规格
@@ -2625,6 +2634,13 @@ class ProductAttributesPredictor():
                 if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
+                    if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
+                            quantity_header = header_list[1].replace('单位:', '')
+                            if re.search('(([\w/]{,5}))', quantity_header):
+                                header_quan_unit = re.search('(([\w/]{,5}))', quantity_header).group(1)
+                            else:
+                                header_quan_unit = ""
+
                     if found_header and len(headers)<1:  # 只保留出现的第一个表头
                         headers.append('_'.join(header_list))
                         headers_demand.append('_'.join(header_list2))
@@ -2638,6 +2654,7 @@ class ProductAttributesPredictor():
                         continue
                     id1 = header_dic.get('名称', "")
                     id2 = header_dic.get('数量', "")
+                    id2_2 = header_dic.get('单位', "")
                     id3 = header_dic.get('单价', "")
                     id4 = header_dic.get('品牌', "")
                     id5 = header_dic.get('规格', "")
@@ -2651,8 +2668,20 @@ class ProductAttributesPredictor():
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                                 quantity = tds[id2]
-                            else:
-                                quantity = ""
+                                quantity = re.sub('[()(),,约]', '', quantity)
+                                quantity = re.sub('[一壹]', '1', quantity)
+                                ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
+                                if ser:
+                                    quantity = str(ser.group(1))
+                                    quantity_unit = ser.group(2)
+                                    if quantity_unit == "" and header_quan_unit != "":
+                                        quantity_unit = header_quan_unit
+                                else:
+                                    quantity = ""
+                                    quantity_unit = ""
+                        if id2_2 != "":
+                            if re.search('^\w{1,4}$', tds[id2_2]):
+                                quantity_unit = tds[id2_2]
                         if id3 != "":
                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                 _unitPrice = tds[id3]
@@ -2697,7 +2726,7 @@ class ProductAttributesPredictor():
                             if len(unitPrice) > 15 or len(product)>100:  # 单价大于15位数或 产品名称长于100字
                                 i += 1
                                 continue
-                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
+                            link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                                       'brand': brand[:50], 'specs':specs}
                             if link not in product_link:
                                 product_link.append(link)
@@ -4238,7 +4267,7 @@ class DistrictPredictor():
             self.short2id = short2id
             self.full2id = full2id
 
-    def predict(self, project_name, prem, title, list_articles, web_source_name = ""):
+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
         '''
         先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
         :param project_name:
@@ -4271,8 +4300,8 @@ class DistrictPredictor():
                     for _id in self.full2id[name]:
                         area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
                         # score_l.append([_id, score] + area)
-                        w = self.dist_dic[_id]['权重']
-                        score_l.append([_id, score + w] + area)
+                        # w = self.dist_dic[_id]['权重']
+                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ,不加权重,因为权重某些赋值不好
 
                 flag = 0
                 for it in re.finditer(self.short_name, text):
@@ -4286,14 +4315,22 @@ class DistrictPredictor():
                             area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
                             if area[0] in ['2', '16', '20', '30']:
                                 _type += 10
+                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
+                                w = 1
                             score2 += w
                             if _id not in id_set:
                                 if _type == 20:
                                     type_w = 3
                                 elif _type == 30:
-                                    type_w = 2
+                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
+                                        type_w = 0
+                                    else:
+                                        type_w = 2
                                 else:
-                                    type_w = 1
+                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
+                                        type_w = 2
+                                    else:
+                                        type_w = 1
                                 id_set.add(_id)
                                 score2 += w * type_w
                             score_l.append([_id, score * w + score2] + area)
@@ -4344,29 +4381,53 @@ class DistrictPredictor():
                3:地址直接在招标人后面 招标人:xxx,地址:xxx
                4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
             '''
-            p3 = '(招标|采购)(人|单位)(信息:)?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
-            p4 = '(招标|采购)(人|单位)(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
+            p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
+            p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
+            p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
             if re.search(p3, text):
                 return re.search(p3, text).group('addr')
             elif re.search(p4, text):
                 return re.search(p4, text).group('addr')
+            elif re.search(p5, text):
+                return re.search(p5, text).group('addr')
             else:
                 return ''
 
         def get_project_addr(text):
-            p1 = '(项目|建设|工程|服务|交货|送货|收货|)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
+            p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
             if re.search(p1, text):
                 return re.search(p1, text).group(0)
             else:
                 return ''
 
         def get_bid_addr(text):
-            p2 = '(磋商|谈判|开标|投标|评标|(采购|招标)(人|单位)|报名|递交|评审|发售)(地址|地点|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
             if re.search(p2, text):
                 return re.search(p2, text).group(0)
             else:
                 return ''
 
+        def get_all_addr(list_entitys):
+            tenderee_l = []
+            other_roles = []
+            addr_l = []
+            for ent in list_entitys[0]:
+                if ent.entity_type == 'location':
+                    addr_l.append(ent.entity_text)
+                elif ent.entity_type in ['org', 'company']:
+                    if ent.label == 0:
+                        tenderee_l.append(ent.entity_text)
+                    else:
+                        other_roles.append(ent.entity_text)
+            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
+
+        def get_title_addr(text):
+            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
+            if re.search(p1, text):
+                return re.search(p1, text).group(0)
+            else:
+                return ''
+
         if '##attachment##' in list_articles[0].content:
             content, attachment = list_articles[0].content.split('##attachment##')
             if len(content) < 200:
@@ -4384,9 +4445,13 @@ class DistrictPredictor():
                 tenderee_address = role_addr
 
         if tenderee_address == "":
-            bid_addr = get_bid_addr(content)
-            if bid_addr != "":
-                tenderee_address = bid_addr
+            title_addr = get_title_addr(title)
+            if title_addr != "":
+                tenderee_address = title_addr
+            else:
+                bid_addr = get_bid_addr(content)
+                if bid_addr != "":
+                    tenderee_address = bid_addr
 
         project_name = str(project_name)
         tenderee = str(tenderee)
@@ -4397,15 +4462,22 @@ class DistrictPredictor():
         project_name = project_name.replace(tenderee, '')
 
         text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
-        # print('text1:', text1)
 
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
+        # print('text1:', text1)
         rs = get_area(text1, web_source_name)
 
         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
-            text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
+            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
+            if tenderees != "":
+                text2 = tenderees + " " + all_addr
+                # print('所有地址:', all_addr)
+            else:
+                text2 = other_roles + " " + all_addr
+                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
+            # print('text2:', text2)
             rs2 = get_area(text2, web_source_name, not_in_content=False)
             rs2['district']['is_in_text'] = True
             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':