Explorar el Código

Merge remote-tracking branch 'origin/master'

luojiehua hace 2 años
padre
commit
e5e797090e
Se ha modificado 1 fichero con 3 adiciones y 1 borrado
  1. 3 1
      BiddingKG/dl/interface/Preprocessing.py

+ 3 - 1
BiddingKG/dl/interface/Preprocessing.py

@@ -1960,7 +1960,7 @@ def del_tabel_achievement(soup):
     if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
         return None
     p1 = '中标(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
-
+    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
     for tag in soup.find_all('table'):
         pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
         tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
@@ -1977,10 +1977,12 @@ def del_tabel_achievement(soup):
                     pre_tag = tag.findPreviousSibling().extract()
                     del_tag = tag.extract()
                     # print('删除表格业绩内容', pre_tag.text + del_tag.text)
+                    break
         elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
             del_tag = tag.extract()
             # print('删除表格业绩内容', del_tag.text)
     del_trs = []
+    '''删除表格某些行公布的业绩信息'''
     for tag in soup.find_all('table'):
         text = tag.text
         if re.search('业绩', text) == None: