Sfoglia il codice sorgente

预处理错误修复

znj 3 anni fa
parent
commit
51d8ecafd1

+ 21 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -1669,7 +1669,13 @@ def special_treatment(sourceContent, web_source_no):
                      new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
              sourceContent = sourceContent.replace(ser.group(0), new, 1)
     elif web_source_no == '00753-14':
-        pcontent = sourceContent.find("div", id="pcontent")
+        body = sourceContent.find("body")
+        body_child = body.find_all(recursive=False)
+        pcontent = body
+        if 'id' in body_child[0].attrs:
+            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                pcontent = body_child[0]
+        # pcontent = sourceContent.find("div", id="pcontent")
         pcontent = pcontent.find_all(recursive=False)[0]
         first_table = None
         for idx in range(len(pcontent.find_all(recursive=False))):
@@ -1683,7 +1689,13 @@ def special_treatment(sourceContent, web_source_no):
                     first_table.find("tbody").append(_tr)
                 t_part.clear()
     elif web_source_no == 'DX008357-11':
-        pcontent = sourceContent.find("div", id="pcontent")
+        body = sourceContent.find("body")
+        body_child = body.find_all(recursive=False)
+        pcontent = body
+        if 'id' in body_child[0].attrs:
+            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                pcontent = body_child[0]
+        # pcontent = sourceContent.find("div", id="pcontent")
         pcontent = pcontent.find_all(recursive=False)[0]
         error_table = []
         is_error_table = False
@@ -1711,7 +1723,13 @@ def special_treatment(sourceContent, web_source_no):
                         first_table.find("tbody").append(_tr)
                 t_part.clear()
     elif web_source_no == '18021-2':
-        pcontent = sourceContent.find("div", id="pcontent")
+        body = sourceContent.find("body")
+        body_child = body.find_all(recursive=False)
+        pcontent = body
+        if 'id' in body_child[0].attrs:
+            if len(body_child) <= 2 and body_child[0]['id'] == 'pcontent':
+                pcontent = body_child[0]
+        # pcontent = sourceContent.find("div", id="pcontent")
         td = pcontent.find_all("td")
         for _td in td:
             if str(_td.string).strip() == "报价金额":

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -210,7 +210,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',original_docchann
     if len(product_attrs[1]['demand_info']['data'])>0:
         for d in product_attrs[1]['demand_info']['data']:
             for product in set(prem[0]['product']):
-                if product in d['project_name']:
+                if product in d['project_name'] and product not in d['product']:
                     d['product'].append(product)  #把产品在项目名称中的添加进需求要素中
 
     '''修正采购公告表格形式多种采购产品中标价格'''