Эх сурвалжийг харах

优化产品属性提取;优化角色提取及中标价表格产品价合计替换逻辑

lsm 1 жил өмнө
parent
commit
c7a94a4b9d

+ 1 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -2210,6 +2210,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
+        article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
         ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -327,7 +327,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-07-04'}
+    version_date = {'version_date': '2023-07-18'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 4 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -3400,15 +3400,17 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
             content += attachment
     else:
         content = list_articles[0].content
-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
         if total_product_money>0 and total_product_money<5000000000:
             for value in prem[0]['prem'].values():
+                ree_money = float(value['tendereeMoney'])
                 for l in value['roleList']:
                     try:
                         # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
                         #     l[2] = total_product_money
                         #     log('修改中标金额为所有产品总金额')
-                        if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
+                        # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
                             l["role_money"]['money'] = total_product_money
                             # print('修改中标金额为所有产品总金额')
                     except Exception as e:

+ 22 - 1
BiddingKG/dl/interface/predictor.py

@@ -458,6 +458,8 @@ class CodeNamePredict():
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                         if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
                             continue
+                        elif '公司:你单位在' in _name: # 避免类似 339900030 这种作为项目名称,导致中标角色作为招标角色
+                            continue
 
                         #add name to entitys
                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
@@ -2912,7 +2914,7 @@ class ProductAttributesPredictor():
         :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息
         '''
 
-
+        html = html.replace('<br>', '\n')
         soup = BeautifulSoup(html, 'lxml')
         # flag_yx = True if re.search('采购意向', html) else False
         flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
@@ -2924,6 +2926,7 @@ class ProductAttributesPredictor():
         demand_link = []
         product_set = set()
         total_product_money = 0
+        unit_price_list = []
         for i in range(len(tables)-1, -1, -1):
             table = tables[i]
             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
@@ -3069,7 +3072,13 @@ class ProductAttributesPredictor():
                     if product != "":
                         if id2 != "":
                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
+                                if re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|万?元', tds[id2]):
+                                    i += 1
+                                    continue
                                 quantity = tds[id2]
+                            elif re.search('\w', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
+                                i += 1
+                                continue
                         if id2_2 != "":
                             if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
                                 quantity_unit = tds[id2_2]
@@ -3078,6 +3087,9 @@ class ProductAttributesPredictor():
                                 unitPrice = tds[id3]
                             elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
                                 unitPrice = tds[id3]
+                            elif re.search('\w', tds[id3]) and re.search('^详见|^略', tds[id3])==None:
+                                i += 1
+                                continue
                         if id4 != "":
                             if re.search('\w', tds[id4]):
                                 brand = tds[id4]
@@ -3109,6 +3121,9 @@ class ProductAttributesPredictor():
                                 total_price = tds[id9]
                             elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
                                 total_price = tds[id9]
+                            elif re.search('\w', tds[id9]) and re.search('^详见|^略', tds[id9])==None:
+                                i += 1
+                                continue
                         if id10 != "":
                             parameter = tds[id10][:500]
                             if re.match('^详见|^略$', parameter.strip()):
@@ -3200,6 +3215,8 @@ class ProductAttributesPredictor():
                                     product_link.append(link)
                                     if link['unitPrice'] != "" and link['quantity'] != '':
                                         try:
+                                            if link['unitPrice']:
+                                                unit_price_list.append(link['unitPrice'])
                                             total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
                                         except:
                                             log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
@@ -3227,6 +3244,8 @@ class ProductAttributesPredictor():
             demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
         else:
             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
+        if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
+            total_product_money = 0
         return [attr_dic, demand_dic], total_product_money
 
     def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
@@ -5116,6 +5135,8 @@ class TablePremExtractor(object):
             flag = True
             for i in range(len(td_list)) :
                 text = td_list[i]
+                if text == '备选中标人':
+                    text = '第二候选人'
                 if len(text) > 15: # 长度大于15 不进行表头匹配
                     continue
                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格

+ 1 - 0
BiddingKG/readme/start.md

@@ -9,6 +9,7 @@ cd /data/python
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #启动接口
 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #19022启动要素提取接口

+ 2 - 2
BiddingKG/run_extract_server.py

@@ -81,7 +81,7 @@ def run_thread(data,list_result):
     web_source_no = data.get("web_source_no","")
     web_source_name = data.get("web_source_name","")
     original_docchannel = data.get("original_docchannel","")
-    print("web_source_name:",web_source_name)
+    # print("web_source_name:",web_source_name)
     is_fail = False
     try:
         if _content!="":
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
     # 以json形式返回结果
     #_resp = json.dumps(data_res,cls=MyEncoder)
     #log(str(data["flag"])+str(data))
-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
     list_result.append(data_res)
     if is_fail:
         list_result.append(is_fail)