Parcourir la source

修复站源名称或编号为None报错;字数统计;产权、拍卖优先级调整;

lsm il y a 10 mois
Parent
commit
dc7bd41859
2 fichiers modifiés avec 16 ajouts et 11 suppressions
  1. BiddingKG/dl/interface/extract.py (+12 -7)
  2. BiddingKG/dl/interface/predictor.py (+4 -4)

+ 12 - 7
BiddingKG/dl/interface/extract.py

@@ -250,6 +250,10 @@ def repair_entity(prem,district_dict,list_articles):
 
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
     cost_time = dict()
+    if web_source_no == None:
+        web_source_no = ''
+    if web_source_name == None:
+        web_source_name = ''
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
@@ -446,7 +450,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-09-26'}
+    version_date = {'version_date': '2024-09-29'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -503,12 +507,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     # 投标地址
     data_res['addr_dic']['addr_bidsend'] = addr_bidsend_text
     # 字数
-    if '##attachment##' in list_articles[0].content:
-        text_main, text_attn = list_articles[0].content.split('##attachment##')
-    else:
-        text_main = list_articles[0].content
-        text_attn = ""
-    data_res['word_count'] = {'正文': len(text_main), '附件': len(text_attn)}
+    text_main, text_attn = 0, 0
+    for sentence in list_sentences[0]:
+        if sentence.in_attachment:
+            text_attn += len(sentence.sentence_text)
+        else:
+            text_main += len(sentence.sentence_text)
+    data_res['word_count'] = {'正文': text_main, '附件': text_attn}
     # 限制产品数量
     data_res['product'] = data_res['product'][:500]
     data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]

+ 4 - 4
BiddingKG/dl/interface/predictor.py

@@ -4328,14 +4328,14 @@ class DocChannel():
               if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title):
                   return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
               return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
-          elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
-              if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title):
-                  return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
-              return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
           elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
               if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title):
                   return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
               return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
+          elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
+              if re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title):
+                  return '采招数据', re.search(self.title_type_dic['采招数据'], text.strip().split(' ')[0] + title).group(0)
+              return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
           elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
               return '采招数据', (
                           re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(