hai 2 anos · 167442dc4b
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -245,7 +245,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     '''地区获取'''
														
 
															     start_time = time.time()
														
 
															-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name)
														
 
															+    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
														
 
															     cost_time["district"] = round(time.time() - start_time, 2)
														
 
															     '''限制行业最高金额'''
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -683,6 +683,8 @@ class PREMPredict():
 
															             text = text_list[i]
														
 
															             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他，让后面的规则召回重新判断
														
 
															                 label = 5
														
 
															+            elif label in [2,3,4] and re.search('序号：\d+，', text):
														
 
															+                label = 5
														
 
															             elif label == 2:
														
 
															                 if re.search('中标单位和.{,25}签订合同', text):
														
 
															                     label = 0
														
@@ -1348,7 +1350,7 @@ class RoleRulePredictor():
 
															                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
														
 
															                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
														
 
															                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
														
 
															-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标：否',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
														
 
															+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标：否|序号：\d+',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
														
 
															                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
														
 
															                                                         _flag = True
														
 
															                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
														
@@ -2248,6 +2250,7 @@ class ProductAttributesPredictor():
 
															             for td in tds:
														
 
															                 td_text = re.sub('\s', '', td.get_text())
														
 
															                 td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
														
 
															+                td_text = td_text.replace("(", "（").replace(")", "）").replace(':', '：')
														
 
															                 tr_line.append(td_text)
														
 
															             inner_table.append(tr_line)
														
 
															         return inner_table
														
@@ -2459,9 +2462,10 @@ class ProductAttributesPredictor():
 
															         :return: 表头所在列序号，是否表头，表头内容
														
 
															         '''
														
 
															         flag = False
														
 
															-        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
														
 
															+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
														
 
															         product = ""  # 产品
														
 
															         quantity = ""  # 数量
														
 
															+        quantity_unit = "" # 数量单位
														
 
															         unitPrice = ""  # 单价
														
 
															         brand = ""  # 品牌
														
 
															         specs = ""  # 规格
														
@@ -2489,9 +2493,12 @@ class ProductAttributesPredictor():
 
															             for j in range(i + 1, len(items)):
														
 
															                 if len(items[j]) > 20 and len(re.sub('[\(（].*[）\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
														
 
															                     continue
														
 
															-                if header_dic['数量']=="" and re.search('数量|采购量', items[j]):
														
 
															+                if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
														
 
															                     header_dic['数量'] = j
														
 
															                     quantity = items[j]
														
 
															+                elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]):
														
 
															+                    header_dic['单位'] = j
														
 
															+                    quantity_unit = items[j]
														
 
															                 elif re.search('单价', items[j]):
														
 
															                     header_dic['单价'] = j
														
 
															                     unitPrice = items[j]
														
@@ -2518,9 +2525,9 @@ class ProductAttributesPredictor():
 
															                     if it != "":
														
 
															                         num  += 1
														
 
															                 if num >=2:
														
 
															-                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
														
 
															+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
														
 
															         flag = False
														
 
															-        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
														
 
															+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
														
 
															     def predict(self, docid='', html='', page_time=""):
														
 
															         '''
														
@@ -2553,6 +2560,7 @@ class ProductAttributesPredictor():
 
															             # print(inner_table)
														
 
															             i = 0
														
 
															             found_header = False
														
 
															+            header_quan_unit = ""  # 数量表头 包含单位
														
 
															             header_colnum = 0
														
 
															             if flag_yx:
														
 
															                 col0_l = []
														
@@ -2613,6 +2621,7 @@ class ProductAttributesPredictor():
 
															                     continue
														
 
															                 product = ""  # 产品
														
 
															                 quantity = ""  # 数量
														
 
															+                quantity_unit = "" # 数量单位
														
 
															                 unitPrice = ""  # 单价
														
 
															                 brand = ""  # 品牌
														
 
															                 specs = ""  # 规格
														
@@ -2625,6 +2634,13 @@ class ProductAttributesPredictor():
 
															                 if len(set([re.sub('[:：]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
														
 
															                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
														
 
															                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
														
 
															+                    if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
														
 
															+                            quantity_header = header_list[1].replace('单位：', '')
														
 
															+                            if re.search('（([\w/]{,5})）', quantity_header):
														
 
															+                                header_quan_unit = re.search('（([\w/]{,5})）', quantity_header).group(1)
														
 
															+                            else:
														
 
															+                                header_quan_unit = ""
														
 
															+
														
 
															                     if found_header and len(headers)<1:  # 只保留出现的第一个表头
														
 
															                         headers.append('_'.join(header_list))
														
 
															                         headers_demand.append('_'.join(header_list2))
														
@@ -2638,6 +2654,7 @@ class ProductAttributesPredictor():
 
															                         continue
														
 
															                     id1 = header_dic.get('名称', "")
														
 
															                     id2 = header_dic.get('数量', "")
														
 
															+                    id2_2 = header_dic.get('单位', "")
														
 
															                     id3 = header_dic.get('单价', "")
														
 
															                     id4 = header_dic.get('品牌', "")
														
 
															                     id5 = header_dic.get('规格', "")
														
@@ -2651,8 +2668,20 @@ class ProductAttributesPredictor():
 
															                         if id2 != "":
														
 
															                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
														
 
															                                 quantity = tds[id2]
														
 
															-                            else:
														
 
															-                                quantity = ""
														
 
															+                                quantity = re.sub('[()（）,，约]', '', quantity)
														
 
															+                                quantity = re.sub('[一壹]', '1', quantity)
														
 
															+                                ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
														
 
															+                                if ser:
														
 
															+                                    quantity = str(ser.group(1))
														
 
															+                                    quantity_unit = ser.group(2)
														
 
															+                                    if quantity_unit == "" and header_quan_unit != "":
														
 
															+                                        quantity_unit = header_quan_unit
														
 
															+                                else:
														
 
															+                                    quantity = ""
														
 
															+                                    quantity_unit = ""
														
 
															+                        if id2_2 != "":
														
 
															+                            if re.search('^\w{1,4}$', tds[id2_2]):
														
 
															+                                quantity_unit = tds[id2_2]
														
 
															                         if id3 != "":
														
 
															                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
														
 
															                                 _unitPrice = tds[id3]
														
@@ -2697,7 +2726,7 @@ class ProductAttributesPredictor():
 
															                             if len(unitPrice) > 15 or len(product)>100:  # 单价大于15位数或 产品名称长于100字
														
 
															                                 i += 1
														
 
															                                 continue
														
 
															-                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
														
 
															+                            link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
														
 
															                                                       'brand': brand[:50], 'specs':specs}
														
 
															                             if link not in product_link:
														
 
															                                 product_link.append(link)
														
@@ -4238,7 +4267,7 @@ class DistrictPredictor():
 
															             self.short2id = short2id
														
 
															             self.full2id = full2id
														
 
															-    def predict(self, project_name, prem, title, list_articles, web_source_name = ""):
														
 
															+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															         '''
														
 
															         先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
														
 
															         :param project_name:
														
@@ -4271,8 +4300,8 @@ class DistrictPredictor():
 
															                     for _id in self.full2id[name]:
														
 
															                         area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
														
 
															                         # score_l.append([_id, score] + area)
														
 
															-                        w = self.dist_dic[_id]['权重']
														
 
															-                        score_l.append([_id, score + w] + area)
														
 
															+                        # w = self.dist_dic[_id]['权重']
														
 
															+                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ，不加权重，因为权重某些赋值不好
														
 
															                 flag = 0
														
 
															                 for it in re.finditer(self.short_name, text):
														
@@ -4286,14 +4315,22 @@ class DistrictPredictor():
 
															                             area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
														
 
															                             if area[0] in ['2', '16', '20', '30']:
														
 
															                                 _type += 10
														
 
															+                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
														
 
															+                                w = 1
														
 
															                             score2 += w
														
 
															                             if _id not in id_set:
														
 
															                                 if _type == 20:
														
 
															                                     type_w = 3
														
 
															                                 elif _type == 30:
														
 
															-                                    type_w = 2
														
 
															+                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
														
 
															+                                        type_w = 0
														
 
															+                                    else:
														
 
															+                                        type_w = 2
														
 
															                                 else:
														
 
															-                                    type_w = 1
														
 
															+                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
														
 
															+                                        type_w = 2
														
 
															+                                    else:
														
 
															+                                        type_w = 1
														
 
															                                 id_set.add(_id)
														
 
															                                 score2 += w * type_w
														
 
															                             score_l.append([_id, score * w + score2] + area)
														
@@ -4344,29 +4381,53 @@ class DistrictPredictor():
 
															                3：地址直接在招标人后面 招标人：xxx,地址：xxx
														
 
															                4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
														
 
															             '''
														
 
															-            p3 = '(招标|采购)(人|单位)(信息：)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															-            p4 = '(招标|采购)(人|单位)(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															             if re.search(p3, text):
														
 
															                 return re.search(p3, text).group('addr')
														
 
															             elif re.search(p4, text):
														
 
															                 return re.search(p4, text).group('addr')
														
 
															+            elif re.search(p5, text):
														
 
															+                return re.search(p5, text).group('addr')
														
 
															             else:
														
 
															                 return ''
														
 
															         def get_project_addr(text):
														
 
															-            p1 = '(项目|建设|工程|服务|交货|送货|收货|)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															             if re.search(p1, text):
														
 
															                 return re.search(p1, text).group(0)
														
 
															             else:
														
 
															                 return ''
														
 
															         def get_bid_addr(text):
														
 
															-            p2 = '(磋商|谈判|开标|投标|评标|(采购|招标)(人|单位)|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															             if re.search(p2, text):
														
 
															                 return re.search(p2, text).group(0)
														
 
															             else:
														
 
															                 return ''
														
 
															+        def get_all_addr(list_entitys):
														
 
															+            tenderee_l = []
														
 
															+            other_roles = []
														
 
															+            addr_l = []
														
 
															+            for ent in list_entitys[0]:
														
 
															+                if ent.entity_type == 'location':
														
 
															+                    addr_l.append(ent.entity_text)
														
 
															+                elif ent.entity_type in ['org', 'company']:
														
 
															+                    if ent.label == 0:
														
 
															+                        tenderee_l.append(ent.entity_text)
														
 
															+                    else:
														
 
															+                        other_roles.append(ent.entity_text)
														
 
															+            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
														
 
															+
														
 
															+        def get_title_addr(text):
														
 
															+            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            if re.search(p1, text):
														
 
															+                return re.search(p1, text).group(0)
														
 
															+            else:
														
 
															+                return ''
														
 
															+
														
 
															         if '##attachment##' in list_articles[0].content:
														
 
															             content, attachment = list_articles[0].content.split('##attachment##')
														
 
															             if len(content) < 200:
														
@@ -4384,9 +4445,13 @@ class DistrictPredictor():
 
															                 tenderee_address = role_addr
														
 
															         if tenderee_address == "":
														
 
															-            bid_addr = get_bid_addr(content)
														
 
															-            if bid_addr != "":
														
 
															-                tenderee_address = bid_addr
														
 
															+            title_addr = get_title_addr(title)
														
 
															+            if title_addr != "":
														
 
															+                tenderee_address = title_addr
														
 
															+            else:
														
 
															+                bid_addr = get_bid_addr(content)
														
 
															+                if bid_addr != "":
														
 
															+                    tenderee_address = bid_addr
														
 
															         project_name = str(project_name)
														
 
															         tenderee = str(tenderee)
														
@@ -4397,15 +4462,22 @@ class DistrictPredictor():
 
															         project_name = project_name.replace(tenderee, '')
														
 
															         text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
														
 
															-        # print('text1:', text1)
														
 
															         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
														
 
															         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
														
 
															+        # print('text1:', text1)
														
 
															         rs = get_area(text1, web_source_name)
														
 
															         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
														
 
															-            text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															+            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
														
 
															+            if tenderees != "":
														
 
															+                text2 = tenderees + " " + all_addr
														
 
															+                # print('所有地址：', all_addr)
														
 
															+            else:
														
 
															+                text2 = other_roles + " " + all_addr
														
 
															+                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
														
 
															+            # print('text2:', text2)
														
 
															             rs2 = get_area(text2, web_source_name, not_in_content=False)
														
 
															             rs2['district']['is_in_text'] = True
														
 
															             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':