3 年之前 · 52f2f0f291
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -1597,7 +1597,6 @@ def get_preprocessed(articles, useselffool=False):
 
				 
			
 
				 def special_treatment(sourceContent, web_source_no):
			
 
				     if web_source_no == 'DX000202-1':
			
 
				-         sourceContent
			
 
				          ser = re.search('中标供应商及中标金额：【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
			
 
				          if ser:
			
 
				              new = ""
			
@@ -1608,6 +1607,59 @@ def special_treatment(sourceContent, web_source_no):
 
				                      role, money = it.split('-')
			
 
				                      new += '标段%d, 中标供应商: ' % (i + 1) + role + '，中标金额：' + money + '。'
			
 
				              sourceContent = sourceContent.replace(ser.group(0), new, 1)
			
 
				+    elif web_source_no == '00753-14':
			
 
				+        pcontent = sourceContent.find("div", id="pcontent")
			
 
				+        pcontent = pcontent.find_all(recursive=False)[0]
			
 
				+        first_table = None
			
 
				+        for idx in range(len(pcontent.find_all(recursive=False))):
			
 
				+            t_part = pcontent.find_all(recursive=False)[idx]
			
 
				+            if t_part.name != "table":
			
 
				+                break
			
 
				+            if idx == 0:
			
 
				+                first_table = t_part
			
 
				+            else:
			
 
				+                for _tr in t_part.find("tbody").find_all(recursive=False):
			
 
				+                    first_table.find("tbody").append(_tr)
			
 
				+                t_part.clear()
			
 
				+    elif web_source_no == 'DX008357-11':
			
 
				+        pcontent = sourceContent.find("div", id="pcontent")
			
 
				+        pcontent = pcontent.find_all(recursive=False)[0]
			
 
				+        error_table = []
			
 
				+        is_error_table = False
			
 
				+        for part in pcontent.find_all(recursive=False):
			
 
				+            if is_error_table:
			
 
				+                if part.name == "table":
			
 
				+                    error_table.append(part)
			
 
				+                else:
			
 
				+                    break
			
 
				+            if part.name == "div" and part.get_text(strip=True) == "中标候选单位：":
			
 
				+                is_error_table = True
			
 
				+        first_table = None
			
 
				+        for idx in range(len(error_table)):
			
 
				+            t_part = error_table[idx]
			
 
				+            # if t_part.name != "table":
			
 
				+            #     break
			
 
				+            if idx == 0:
			
 
				+                for _tr in t_part.find("tbody").find_all(recursive=False):
			
 
				+                    if _tr.get_text(strip=True) == "":
			
 
				+                        _tr.decompose()
			
 
				+                first_table = t_part
			
 
				+            else:
			
 
				+                for _tr in t_part.find("tbody").find_all(recursive=False):
			
 
				+                    if _tr.get_text(strip=True) != "":
			
 
				+                        first_table.find("tbody").append(_tr)
			
 
				+                t_part.clear()
			
 
				+    elif web_source_no == '18021-2':
			
 
				+        pcontent = sourceContent.find("div", id="pcontent")
			
 
				+        td = pcontent.find_all("td")
			
 
				+        for _td in td:
			
 
				+            if str(_td.string).strip() == "报价金额":
			
 
				+                _td.string = "单价"
			
 
				+    elif web_source_no == '13740-2':
			
 
				+        # “xxx成为成交供应商”
			
 
				+        re_match = re.search("[^，。]+成为[^，。]*成交供应商", sourceContent)
			
 
				+        if re_match:
			
 
				+            sourceContent = sourceContent.replace(re_match.group(), "成交人：" + re_match.group(), sourceContent)
			
 
				     elif web_source_no == '03786-10':
			
 
				         ser1 = re.search('中标价：([\d,.]+)', sourceContent)
			
 
				         ser2 = re.search('合同金额[(（]万元[)）]：([\d,.]+)', sourceContent)
			
@@ -1681,7 +1733,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         start_time = time.time()
			
 
				         # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
			
 
				         article_processed = BeautifulSoup(sourceContent,"lxml")
			
 
				-        # article_processed = preprocessed_html(article_processed,"")
			
 
				+        '''特别数据源对 BeautifulSoup(html) 做特别修改'''
			
 
				+        if web_source_no in ["00753-14","DX008357-11","18021-2"]:
			
 
				+            article_processed = special_treatment(article_processed, web_source_no)
			
 
				         for _soup in article_processed.descendants:
			
 
				             # 识别无标签文本，添加<p>标签
			
 
				             if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
			
@@ -1696,7 +1750,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
				         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
			
 
				         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
			
 
				         '''特别数据源对 预处理后文本 做特别修改'''
			
 
				-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7']:
			
 
				+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
			
 
				             article_processed = special_treatment(article_processed, web_source_no)
			
 
				 
			
 
				         # 提取bidway
			
--- a/BiddingKG/dl/test/测试整个要素提取流程.py
+++ b/BiddingKG/dl/test/测试整个要素提取流程.py
@@ -69,7 +69,7 @@ class MyEncoder(json.JSONEncoder):
 
				 
			
 
				 
			
 
				 def predict(doc_id,text):
			
 
				-    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","",""]],useselffool=True)
			
 
				+    list_articles,list_sentences,list_entitys,list_outlines,_ = Preprocessing.get_preprocessed([[doc_id,text,"","","","",""]],useselffool=True)
			
 
				     for articles in list_articles:
			
 
				         print('预处理后文本信息')
			
 
				         print(articles.content)
			
@@ -92,7 +92,7 @@ def predict(doc_id,text):
 
				     #             print(entity.entity_text, entity.pointer_person,entity.label,entity.values)
			
 
				     #             pass
			
 
				     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
			
 
				-    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
			
 
				+    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys, codeName)
			
 
				     # print("epcPredict")
			
 
				     epcPredict.predict(list_sentences,list_entitys)