2 жил өмнө · c7a94a4b9d
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -2210,6 +2210,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式：名称)', '供应商名称', article_processed)  # 18889217, 84422177
			
 
				         article_processed = re.sub('，最高有效报价者：', '，中标人名称：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
 
				         article_processed = re.sub('，最高有效报价：', '，投标报价：', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
			
 
				+        article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
			
 
				         ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?：(?P<tenderee>[\w（）]{4,25}(/[\w（）]{4,25})?)/(?P<agency>[\w（）]{4,25})[，。]', article_processed)
			
 
				         if ser:
			
 
				             article_processed = article_processed.replace(ser.group(0), '采购人名称：%s，采购代理机构名称：%s，' % (ser.group('tenderee'), ser.group('agency')))
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -327,7 +327,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2023-07-04'}
			
 
				+    version_date = {'version_date': '2023-07-18'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
			
 
				 
			
 
				     '''最终检查修正招标、中标金额'''
			
--- a/BiddingKG/dl/interface/getAttributes.py
+++ b/BiddingKG/dl/interface/getAttributes.py
@@ -3400,15 +3400,17 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
 
				             content += attachment
			
 
				     else:
			
 
				         content = list_articles[0].content
			
 
				-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同)）?(总?金额|[报总]?价)：', content) == None: # 只有一个中标角色且没有明确中标金额表达的
			
 
				+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同|投标)）?(总?金额|[报总]?价)：', content) == None: # 只有一个中标角色且没有明确中标金额表达的
			
 
				         if total_product_money>0 and total_product_money<5000000000:
			
 
				             for value in prem[0]['prem'].values():
			
 
				+                ree_money = float(value['tendereeMoney'])
			
 
				                 for l in value['roleList']:
			
 
				                     try:
			
 
				                         # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
			
 
				                         #     l[2] = total_product_money
			
 
				                         #     log('修改中标金额为所有产品总金额')
			
 
				-                        if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
			
 
				+                        # if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money']) == 0 and float(l["role_money"]['money'])<total_product_money/10:
			
 
				+                        if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money']) == 0 or float(l["role_money"]['money'])<ree_money/2): # 改为小于一半招标金额或为0时替换为合计金额
			
 
				                             l["role_money"]['money'] = total_product_money
			
 
				                             # print('修改中标金额为所有产品总金额')
			
 
				                     except Exception as e:
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -458,6 +458,8 @@ class CodeNamePredict():
 
				                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
			
 
				                         if len(_name)>200: # 避免模型预测类似 202750503 这种很长重复字很多的错误项目名称
			
 
				                             continue
			
 
				+                        elif '公司：你单位在' in _name: # 避免类似 339900030 这种作为项目名称，导致中标角色作为招标角色
			
 
				+                            continue
			
 
				 
			
 
				                         #add name to entitys
			
 
				                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
			
@@ -2912,7 +2914,7 @@ class ProductAttributesPredictor():
 
				         :return:公告表格内 产品、数量、单价、品牌、规格 ，表头，表头列等信息
			
 
				         '''
			
 
				 
			
 
				-
			
 
				+        html = html.replace('<br>', '\n')
			
 
				         soup = BeautifulSoup(html, 'lxml')
			
 
				         # flag_yx = True if re.search('采购意向', html) else False
			
 
				         flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
			
@@ -2924,6 +2926,7 @@ class ProductAttributesPredictor():
 
				         demand_link = []
			
 
				         product_set = set()
			
 
				         total_product_money = 0
			
 
				+        unit_price_list = []
			
 
				         for i in range(len(tables)-1, -1, -1):
			
 
				             table = tables[i]
			
 
				             if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
			
@@ -3069,7 +3072,13 @@ class ProductAttributesPredictor():
 
				                     if product != "":
			
 
				                         if id2 != "":
			
 
				                             if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
			
 
				+                                if re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7}，?)$)|万?元', tds[id2]):
			
 
				+                                    i += 1
			
 
				+                                    continue
			
 
				                                 quantity = tds[id2]
			
 
				+                            elif re.search('\w', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
			
 
				+                                i += 1
			
 
				+                                continue
			
 
				                         if id2_2 != "":
			
 
				                             if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
			
 
				                                 quantity_unit = tds[id2_2]
			
@@ -3078,6 +3087,9 @@ class ProductAttributesPredictor():
 
				                                 unitPrice = tds[id3]
			
 
				                             elif re.search('^[\d,.亿万元人民币欧美日金额：（）();；、，\n]+$', tds[id3].strip()):
			
 
				                                 unitPrice = tds[id3]
			
 
				+                            elif re.search('\w', tds[id3]) and re.search('^详见|^略', tds[id3])==None:
			
 
				+                                i += 1
			
 
				+                                continue
			
 
				                         if id4 != "":
			
 
				                             if re.search('\w', tds[id4]):
			
 
				                                 brand = tds[id4]
			
@@ -3109,6 +3121,9 @@ class ProductAttributesPredictor():
 
				                                 total_price = tds[id9]
			
 
				                             elif re.search('^[\d,.亿万元人民币欧美日金额：（）();；、，\n]+$', tds[id9].strip()):
			
 
				                                 total_price = tds[id9]
			
 
				+                            elif re.search('\w', tds[id9]) and re.search('^详见|^略', tds[id9])==None:
			
 
				+                                i += 1
			
 
				+                                continue
			
 
				                         if id10 != "":
			
 
				                             parameter = tds[id10][:500]
			
 
				                             if re.match('^详见|^略$', parameter.strip()):
			
@@ -3200,6 +3215,8 @@ class ProductAttributesPredictor():
 
				                                     product_link.append(link)
			
 
				                                     if link['unitPrice'] != "" and link['quantity'] != '':
			
 
				                                         try:
			
 
				+                                            if link['unitPrice']:
			
 
				+                                                unit_price_list.append(link['unitPrice'])
			
 
				                                             total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
			
 
				                                         except:
			
 
				                                             log('产品属性单价数量相乘出错, 单价： %s, 数量： %s'%(link['unitPrice'], link['quantity']))
			
@@ -3227,6 +3244,8 @@ class ProductAttributesPredictor():
 
				             demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
			
 
				         else:
			
 
				             demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
			
 
				+        if len(unit_price_list)>0 and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
			
 
				+            total_product_money = 0
			
 
				         return [attr_dic, demand_dic], total_product_money
			
 
				 
			
 
				     def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
			
@@ -5116,6 +5135,8 @@ class TablePremExtractor(object):
 
				             flag = True
			
 
				             for i in range(len(td_list)) :
			
 
				                 text = td_list[i]
			
 
				+                if text == '备选中标人':
			
 
				+                    text = '第二候选人'
			
 
				                 if len(text) > 15: # 长度大于15 不进行表头匹配
			
 
				                     continue
			
 
				                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
			
--- a/BiddingKG/readme/start.md
+++ b/BiddingKG/readme/start.md
@@ -9,6 +9,7 @@ cd /data/python
 
				 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
			
 
				 #启动接口
			
 
				 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
			
 
				+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
			
 
				 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
			
 
				 
			
 
				 #19022启动要素提取接口
			
--- a/BiddingKG/run_extract_server.py
+++ b/BiddingKG/run_extract_server.py
@@ -81,7 +81,7 @@ def run_thread(data,list_result):
 
				     web_source_no = data.get("web_source_no","")
			
 
				     web_source_name = data.get("web_source_name","")
			
 
				     original_docchannel = data.get("original_docchannel","")
			
 
				-    print("web_source_name:",web_source_name)
			
 
				+    # print("web_source_name:",web_source_name)
			
 
				     is_fail = False
			
 
				     try:
			
 
				         if _content!="":
			
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
 
				     # 以json形式返回结果
			
 
				     #_resp = json.dumps(data_res,cls=MyEncoder)
			
 
				     #log(str(data["flag"])+str(data))
			
 
				-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
			
 
				+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
			
 
				     list_result.append(data_res)
			
 
				     if is_fail:
			
 
				         list_result.append(is_fail)