|
@@ -1597,7 +1597,6 @@ def get_preprocessed(articles, useselffool=False):
|
|
|
|
|
|
def special_treatment(sourceContent, web_source_no):
|
|
|
if web_source_no == 'DX000202-1':
|
|
|
- sourceContent
|
|
|
ser = re.search('中标供应商及中标金额:【((\w{5,20}-[\d,.]+,)+)】', sourceContent)
|
|
|
if ser:
|
|
|
new = ""
|
|
@@ -1608,6 +1607,59 @@ def special_treatment(sourceContent, web_source_no):
|
|
|
role, money = it.split('-')
|
|
|
new += '标段%d, 中标供应商: ' % (i + 1) + role + ',中标金额:' + money + '。'
|
|
|
sourceContent = sourceContent.replace(ser.group(0), new, 1)
|
|
|
+ elif web_source_no == '00753-14':
|
|
|
+ pcontent = sourceContent.find("div", id="pcontent")
|
|
|
+ pcontent = pcontent.find_all(recursive=False)[0]
|
|
|
+ first_table = None
|
|
|
+ for idx in range(len(pcontent.find_all(recursive=False))):
|
|
|
+ t_part = pcontent.find_all(recursive=False)[idx]
|
|
|
+ if t_part.name != "table":
|
|
|
+ break
|
|
|
+ if idx == 0:
|
|
|
+ first_table = t_part
|
|
|
+ else:
|
|
|
+ for _tr in t_part.find("tbody").find_all(recursive=False):
|
|
|
+ first_table.find("tbody").append(_tr)
|
|
|
+ t_part.clear()
|
|
|
+ elif web_source_no == 'DX008357-11':
|
|
|
+ pcontent = sourceContent.find("div", id="pcontent")
|
|
|
+ pcontent = pcontent.find_all(recursive=False)[0]
|
|
|
+ error_table = []
|
|
|
+ is_error_table = False
|
|
|
+ for part in pcontent.find_all(recursive=False):
|
|
|
+ if is_error_table:
|
|
|
+ if part.name == "table":
|
|
|
+ error_table.append(part)
|
|
|
+ else:
|
|
|
+ break
|
|
|
+ if part.name == "div" and part.get_text(strip=True) == "中标候选单位:":
|
|
|
+ is_error_table = True
|
|
|
+ first_table = None
|
|
|
+ for idx in range(len(error_table)):
|
|
|
+ t_part = error_table[idx]
|
|
|
+ # if t_part.name != "table":
|
|
|
+ # break
|
|
|
+ if idx == 0:
|
|
|
+ for _tr in t_part.find("tbody").find_all(recursive=False):
|
|
|
+ if _tr.get_text(strip=True) == "":
|
|
|
+ _tr.decompose()
|
|
|
+ first_table = t_part
|
|
|
+ else:
|
|
|
+ for _tr in t_part.find("tbody").find_all(recursive=False):
|
|
|
+ if _tr.get_text(strip=True) != "":
|
|
|
+ first_table.find("tbody").append(_tr)
|
|
|
+ t_part.clear()
|
|
|
+ elif web_source_no == '18021-2':
|
|
|
+ pcontent = sourceContent.find("div", id="pcontent")
|
|
|
+ td = pcontent.find_all("td")
|
|
|
+ for _td in td:
|
|
|
+ if str(_td.string).strip() == "报价金额":
|
|
|
+ _td.string = "单价"
|
|
|
+ elif web_source_no == '13740-2':
|
|
|
+ # “xxx成为成交供应商”
|
|
|
+ re_match = re.search("[^,。]+成为[^,。]*成交供应商", sourceContent)
|
|
|
+ if re_match:
|
|
|
+ sourceContent = sourceContent.replace(re_match.group(), "成交人:" + re_match.group(), sourceContent)
|
|
|
elif web_source_no == '03786-10':
|
|
|
ser1 = re.search('中标价:([\d,.]+)', sourceContent)
|
|
|
ser2 = re.search('合同金额[((]万元[))]:([\d,.]+)', sourceContent)
|
|
@@ -1681,7 +1733,9 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
start_time = time.time()
|
|
|
# article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
|
|
|
article_processed = BeautifulSoup(sourceContent,"lxml")
|
|
|
- # article_processed = preprocessed_html(article_processed,"")
|
|
|
+ '''特别数据源对 BeautifulSoup(html) 做特别修改'''
|
|
|
+ if web_source_no in ["00753-14","DX008357-11","18021-2"]:
|
|
|
+ article_processed = special_treatment(article_processed, web_source_no)
|
|
|
for _soup in article_processed.descendants:
|
|
|
# 识别无标签文本,添加<p>标签
|
|
|
if not _soup.name and not _soup.parent.string and _soup.string.strip()!="":
|
|
@@ -1696,7 +1750,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
|
|
|
article_processed = article_processed.replace('成交工程价款', '成交工程价') # 2021/12/21 修正为中标价
|
|
|
'''特别数据源对 预处理后文本 做特别修改'''
|
|
|
- if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7']:
|
|
|
+ if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2"]:
|
|
|
article_processed = special_treatment(article_processed, web_source_no)
|
|
|
|
|
|
# 提取bidway
|