2 anos atrás · 167442dc4b
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -245,7 +245,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     '''地区获取'''
			
 
				     start_time = time.time()
			
 
				-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name)
			
 
				+    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
			
 
				     cost_time["district"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				     '''限制行业最高金额'''
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -683,6 +683,8 @@ class PREMPredict():
 
				             text = text_list[i]
			
 
				             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他，让后面的规则召回重新判断
			
 
				                 label = 5
			
 
				+            elif label in [2,3,4] and re.search('序号：\d+，', text):
			
 
				+                label = 5
			
 
				             elif label == 2:
			
 
				                 if re.search('中标单位和.{,25}签订合同', text):
			
 
				                     label = 0
			
@@ -1348,7 +1350,7 @@ class RoleRulePredictor():
 
				                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
			
 
				                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
			
 
				                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
			
 
				-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标：否',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
			
 
				+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标：否|序号：\d+',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
			
 
				                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
			
 
				                                                         _flag = True
			
 
				                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
			
@@ -2248,6 +2250,7 @@ class ProductAttributesPredictor():
 
				             for td in tds:
			
 
				                 td_text = re.sub('\s', '', td.get_text())
			
 
				                 td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\  气动执行装置备件\密封组件\NBR+PT
			
 
				+                td_text = td_text.replace("(", "（").replace(")", "）").replace(':', '：')
			
 
				                 tr_line.append(td_text)
			
 
				             inner_table.append(tr_line)
			
 
				         return inner_table
			
@@ -2459,9 +2462,10 @@ class ProductAttributesPredictor():
 
				         :return: 表头所在列序号，是否表头，表头内容
			
 
				         '''
			
 
				         flag = False
			
 
				-        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
			
 
				+        header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
			
 
				         product = ""  # 产品
			
 
				         quantity = ""  # 数量
			
 
				+        quantity_unit = "" # 数量单位
			
 
				         unitPrice = ""  # 单价
			
 
				         brand = ""  # 品牌
			
 
				         specs = ""  # 规格
			
@@ -2489,9 +2493,12 @@ class ProductAttributesPredictor():
 
				             for j in range(i + 1, len(items)):
			
 
				                 if len(items[j]) > 20 and len(re.sub('[\(（].*[）\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
			
 
				                     continue
			
 
				-                if header_dic['数量']=="" and re.search('数量|采购量', items[j]):
			
 
				+                if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
			
 
				                     header_dic['数量'] = j
			
 
				                     quantity = items[j]
			
 
				+                elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]):
			
 
				+                    header_dic['单位'] = j
			
 
				+                    quantity_unit = items[j]
			
 
				                 elif re.search('单价', items[j]):
			
 
				                     header_dic['单价'] = j
			
 
				                     unitPrice = items[j]
			
@@ -2518,9 +2525,9 @@ class ProductAttributesPredictor():
 
				                     if it != "":
			
 
				                         num  += 1
			
 
				                 if num >=2:
			
 
				-                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
			
 
				+                    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
			
 
				         flag = False
			
 
				-        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
			
 
				+        return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
			
 
				 
			
 
				     def predict(self, docid='', html='', page_time=""):
			
 
				         '''
			
@@ -2553,6 +2560,7 @@ class ProductAttributesPredictor():
 
				             # print(inner_table)
			
 
				             i = 0
			
 
				             found_header = False
			
 
				+            header_quan_unit = ""  # 数量表头 包含单位
			
 
				             header_colnum = 0
			
 
				             if flag_yx:
			
 
				                 col0_l = []
			
@@ -2613,6 +2621,7 @@ class ProductAttributesPredictor():
 
				                     continue
			
 
				                 product = ""  # 产品
			
 
				                 quantity = ""  # 数量
			
 
				+                quantity_unit = "" # 数量单位
			
 
				                 unitPrice = ""  # 单价
			
 
				                 brand = ""  # 品牌
			
 
				                 specs = ""  # 规格
			
@@ -2625,6 +2634,13 @@ class ProductAttributesPredictor():
 
				                 if len(set([re.sub('[:：]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
			
 
				                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
			
 
				                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
			
 
				+                    if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
			
 
				+                            quantity_header = header_list[1].replace('单位：', '')
			
 
				+                            if re.search('（([\w/]{,5})）', quantity_header):
			
 
				+                                header_quan_unit = re.search('（([\w/]{,5})）', quantity_header).group(1)
			
 
				+                            else:
			
 
				+                                header_quan_unit = ""
			
 
				+
			
 
				                     if found_header and len(headers)<1:  # 只保留出现的第一个表头
			
 
				                         headers.append('_'.join(header_list))
			
 
				                         headers_demand.append('_'.join(header_list2))
			
@@ -2638,6 +2654,7 @@ class ProductAttributesPredictor():
 
				                         continue
			
 
				                     id1 = header_dic.get('名称', "")
			
 
				                     id2 = header_dic.get('数量', "")
			
 
				+                    id2_2 = header_dic.get('单位', "")
			
 
				                     id3 = header_dic.get('单价', "")
			
 
				                     id4 = header_dic.get('品牌', "")
			
 
				                     id5 = header_dic.get('规格', "")
			
@@ -2651,8 +2668,20 @@ class ProductAttributesPredictor():
 
				                         if id2 != "":
			
 
				                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
			
 
				                                 quantity = tds[id2]
			
 
				-                            else:
			
 
				-                                quantity = ""
			
 
				+                                quantity = re.sub('[()（）,，约]', '', quantity)
			
 
				+                                quantity = re.sub('[一壹]', '1', quantity)
			
 
				+                                ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
			
 
				+                                if ser:
			
 
				+                                    quantity = str(ser.group(1))
			
 
				+                                    quantity_unit = ser.group(2)
			
 
				+                                    if quantity_unit == "" and header_quan_unit != "":
			
 
				+                                        quantity_unit = header_quan_unit
			
 
				+                                else:
			
 
				+                                    quantity = ""
			
 
				+                                    quantity_unit = ""
			
 
				+                        if id2_2 != "":
			
 
				+                            if re.search('^\w{1,4}$', tds[id2_2]):
			
 
				+                                quantity_unit = tds[id2_2]
			
 
				                         if id3 != "":
			
 
				                             if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
			
 
				                                 _unitPrice = tds[id3]
			
@@ -2697,7 +2726,7 @@ class ProductAttributesPredictor():
 
				                             if len(unitPrice) > 15 or len(product)>100:  # 单价大于15位数或 产品名称长于100字
			
 
				                                 i += 1
			
 
				                                 continue
			
 
				-                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
			
 
				+                            link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
			
 
				                                                       'brand': brand[:50], 'specs':specs}
			
 
				                             if link not in product_link:
			
 
				                                 product_link.append(link)
			
@@ -4238,7 +4267,7 @@ class DistrictPredictor():
 
				             self.short2id = short2id
			
 
				             self.full2id = full2id
			
 
				 
			
 
				-    def predict(self, project_name, prem, title, list_articles, web_source_name = ""):
			
 
				+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				         '''
			
 
				         先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
			
 
				         :param project_name:
			
@@ -4271,8 +4300,8 @@ class DistrictPredictor():
 
				                     for _id in self.full2id[name]:
			
 
				                         area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
			
 
				                         # score_l.append([_id, score] + area)
			
 
				-                        w = self.dist_dic[_id]['权重']
			
 
				-                        score_l.append([_id, score + w] + area)
			
 
				+                        # w = self.dist_dic[_id]['权重']
			
 
				+                        score_l.append([_id, score + 1] + area) # 匹配全称的加1 ，不加权重，因为权重某些赋值不好
			
 
				 
			
 
				                 flag = 0
			
 
				                 for it in re.finditer(self.short_name, text):
			
@@ -4286,14 +4315,22 @@ class DistrictPredictor():
 
				                             area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
			
 
				                             if area[0] in ['2', '16', '20', '30']:
			
 
				                                 _type += 10
			
 
				+                            if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # 如果简称后面 有省市县权重改为1
			
 
				+                                w = 1
			
 
				                             score2 += w
			
 
				                             if _id not in id_set:
			
 
				                                 if _type == 20:
			
 
				                                     type_w = 3
			
 
				                                 elif _type == 30:
			
 
				-                                    type_w = 2
			
 
				+                                    if it.start()>3 and text[it.start()-1] == '市': # 城市后面 简称不能作为市
			
 
				+                                        type_w = 0
			
 
				+                                    else:
			
 
				+                                        type_w = 2
			
 
				                                 else:
			
 
				-                                    type_w = 1
			
 
				+                                    if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
			
 
				+                                        type_w = 2
			
 
				+                                    else:
			
 
				+                                        type_w = 1
			
 
				                                 id_set.add(_id)
			
 
				                                 score2 += w * type_w
			
 
				                             score_l.append([_id, score * w + score2] + area)
			
@@ -4344,29 +4381,53 @@ class DistrictPredictor():
 
				                3：地址直接在招标人后面 招标人：xxx,地址：xxx
			
 
				                4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
			
 
				             '''
			
 
				-            p3 = '(招标|采购)(人|单位)(信息：)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				-            p4 = '(招标|采购)(人|单位)(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				             if re.search(p3, text):
			
 
				                 return re.search(p3, text).group('addr')
			
 
				             elif re.search(p4, text):
			
 
				                 return re.search(p4, text).group('addr')
			
 
				+            elif re.search(p5, text):
			
 
				+                return re.search(p5, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				         def get_project_addr(text):
			
 
				-            p1 = '(项目|建设|工程|服务|交货|送货|收货|)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				             if re.search(p1, text):
			
 
				                 return re.search(p1, text).group(0)
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				         def get_bid_addr(text):
			
 
				-            p2 = '(磋商|谈判|开标|投标|评标|(采购|招标)(人|单位)|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				             if re.search(p2, text):
			
 
				                 return re.search(p2, text).group(0)
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				+        def get_all_addr(list_entitys):
			
 
				+            tenderee_l = []
			
 
				+            other_roles = []
			
 
				+            addr_l = []
			
 
				+            for ent in list_entitys[0]:
			
 
				+                if ent.entity_type == 'location':
			
 
				+                    addr_l.append(ent.entity_text)
			
 
				+                elif ent.entity_type in ['org', 'company']:
			
 
				+                    if ent.label == 0:
			
 
				+                        tenderee_l.append(ent.entity_text)
			
 
				+                    else:
			
 
				+                        other_roles.append(ent.entity_text)
			
 
				+            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
			
 
				+
			
 
				+        def get_title_addr(text):
			
 
				+            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            if re.search(p1, text):
			
 
				+                return re.search(p1, text).group(0)
			
 
				+            else:
			
 
				+                return ''
			
 
				+
			
 
				         if '##attachment##' in list_articles[0].content:
			
 
				             content, attachment = list_articles[0].content.split('##attachment##')
			
 
				             if len(content) < 200:
			
@@ -4384,9 +4445,13 @@ class DistrictPredictor():
 
				                 tenderee_address = role_addr
			
 
				 
			
 
				         if tenderee_address == "":
			
 
				-            bid_addr = get_bid_addr(content)
			
 
				-            if bid_addr != "":
			
 
				-                tenderee_address = bid_addr
			
 
				+            title_addr = get_title_addr(title)
			
 
				+            if title_addr != "":
			
 
				+                tenderee_address = title_addr
			
 
				+            else:
			
 
				+                bid_addr = get_bid_addr(content)
			
 
				+                if bid_addr != "":
			
 
				+                    tenderee_address = bid_addr
			
 
				 
			
 
				         project_name = str(project_name)
			
 
				         tenderee = str(tenderee)
			
@@ -4397,15 +4462,22 @@ class DistrictPredictor():
 
				         project_name = project_name.replace(tenderee, '')
			
 
				 
			
 
				         text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
			
 
				-        # print('text1:', text1)
			
 
				 
			
 
				         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
			
 
				         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
			
 
				+        # print('text1:', text1)
			
 
				         rs = get_area(text1, web_source_name)
			
 
				 
			
 
				         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
			
 
				-            text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				+            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
			
 
				+            if tenderees != "":
			
 
				+                text2 = tenderees + " " + all_addr
			
 
				+                # print('所有地址：', all_addr)
			
 
				+            else:
			
 
				+                text2 = other_roles + " " + all_addr
			
 
				+                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
			
 
				+            # print('text2:', text2)
			
 
				             rs2 = get_area(text2, web_source_name, not_in_content=False)
			
 
				             rs2['district']['is_in_text'] = True
			
 
				             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':