Sfoglia il codice sorgente

特别数据源预处理新增

znj 3 anni fa
parent
commit
52f2f0f291

+ 57 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1597,7 +1597,6 @@ def get_preprocessed(articles, useselffool=False):
 
 def special_treatment(sourceContent, web_source_no):
     if web_source_no == 'DX000202-1':
-         sourceContent
          ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
          if ser:
              new = ""
@@ -1608,6 +1607,59 @@ def special_treatment(sourceContent, web_source_no):
                      role, money = it.split('-')
                      new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
              sourceContent = sourceContent.replace(ser.group(0), new, 1)
+    elif web_source_no == '00753-14':
+        pcontent = sourceContent.find("div", id="pcontent")
+        pcontent = pcontent.find_all(recursive=False)[0]
+        first_table = None
+        for idx in range(len(pcontent.find_all(recursive=False))):
+            t_part = pcontent.find_all(recursive=False)[idx]
+            if t_part.name != "table":
+                break
+            if idx == 0:
+                first_table = t_part
+            else:
+                for _tr in t_part.find("tbody").find_all(recursive=False):
+                    first_table.find("tbody").append(_tr)
+                t_part.clear()
+    elif web_source_no == 'DX008357-11':
+        pcontent = sourceContent.find("div", id="pcontent")
+        pcontent = pcontent.find_all(recursive=False)[0]
+        error_table = []
+        is_error_table = False
+        for part in pcontent.find_all(recursive=False):
+            if is_error_table:
+                if part.name == "table":
+                    error_table.append(part)
+                else:
+                    break
+            if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
+                is_error_table = True
+        first_table = None
+        for idx in range(len(error_table)):
+            t_part = error_table[idx]
+            # if t_part.name != "table":
+            #     break
+            if idx == 0:
+                for _tr in t_part.find("tbody").find_all(recursive=False):
+                    if _tr.get_text(strip=True) == "":
+                        _tr.decompose()
+                first_table = t_part
+            else:
+                for _tr in t_part.find("tbody").find_all(recursive=False):
+                    if _tr.get_text(strip=True) != "":
+                        first_table.find("tbody").append(_tr)
+                t_part.clear()
+    elif web_source_no == '18021-2':
+        pcontent = sourceContent.find("div", id="pcontent")
+        td = pcontent.find_all("td")
+        for _td in td:
+            if str(_td.string).strip() == "报价金额":
+                _td.string = "单价"
+    elif web_source_no == '13740-2':
+        # “xxx成为成交供应商”
+        re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
+        if re_match:
+            sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group(), 1)
     elif web_source_no == '03786-10':
         ser1 = re.search('中标价:([\d,.]+)', sourceContent)
         ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
@@ -1681,7 +1733,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         start_time = time.time()
         # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
         article_processed = BeautifulSoup(sourceContent,"lxml")
-        # article_processed = preprocessed_html(article_processed,"")
+        '''特别数据源对 BeautifulSoup(html) 做特别修改'''
+        if web_source_no in ["00753-14","DX008357-11","18021-2"]:
+            article_processed = special_treatment(article_processed, web_source_no)
         for _soup in article_processed.descendants:
             # 识别无标签文本,添加<p>标签
             if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
@@ -1696,7 +1750,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
         '''特别数据源对 预处理后文本 做特别修改'''
-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7']:
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway

+ 2 - 2
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -69,7 +69,7 @@ class MyEncoder(json.JSONEncoder):
 
 
 def predict(doc_id,text):
-    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
+    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","","",""]],useselffool=True)
     for articles in list_articles:
         print('预处理后文本信息')
         print(articles.content)
@@ -92,7 +92,7 @@ def predict(doc_id,text):
     #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
     #             pass
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)