|
@@ -43,7 +43,7 @@ class MyEncoder(json.JSONEncoder):
|
|
return obj
|
|
return obj
|
|
return json.JSONEncoder.default(self, obj)
|
|
return json.JSONEncoder.default(self, obj)
|
|
|
|
|
|
-def extractCount(extract_dict):
|
|
|
|
|
|
+def extractCount(extract_dict,page_attachments):
|
|
# time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
|
|
# time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
|
|
|
|
|
|
if len(extract_dict):
|
|
if len(extract_dict):
|
|
@@ -113,6 +113,27 @@ def extractCount(extract_dict):
|
|
extract_count += 1
|
|
extract_count += 1
|
|
if project_name!="":
|
|
if project_name!="":
|
|
extract_count += 1
|
|
extract_count += 1
|
|
|
|
+
|
|
|
|
+ if page_attachments is not None and page_attachments!='':
|
|
|
|
+ try:
|
|
|
|
+ _attachments = json.loads(page_attachments,"")
|
|
|
|
+ has_zhaobiao = False
|
|
|
|
+ has_qingdan = False
|
|
|
|
+ if len(_attachments)>0:
|
|
|
|
+ for _atta in _attachments:
|
|
|
|
+ classification = _atta.get("classification","")
|
|
|
|
+ if str(classification)=='招标文件':
|
|
|
|
+ has_zhaobiao = True
|
|
|
|
+ if str(classification)=='采购清单':
|
|
|
|
+ has_qingdan = True
|
|
|
|
+ if has_zhaobiao:
|
|
|
|
+ extract_count += 3
|
|
|
|
+ if has_qingdan:
|
|
|
|
+ extract_count += 2
|
|
|
|
+ except Exception as e:
|
|
|
|
+ pass
|
|
|
|
+
|
|
|
|
+
|
|
return extract_count
|
|
return extract_count
|
|
|
|
|
|
# 字符编码标准化
|
|
# 字符编码标准化
|
|
@@ -176,7 +197,7 @@ def repair_entity(prem,district_dict,list_articles):
|
|
role['role_text'] = city + role_text
|
|
role['role_text'] = city + role_text
|
|
|
|
|
|
|
|
|
|
-def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
|
|
|
|
|
|
+def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
cost_time = dict()
|
|
cost_time = dict()
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
@@ -352,7 +373,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
data_res["nlp_enterprise"] = nlp_enterprise
|
|
data_res["nlp_enterprise"] = nlp_enterprise
|
|
data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
|
|
data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
|
|
# 要素的个数
|
|
# 要素的个数
|
|
- data_res['extract_count'] = extractCount(data_res)
|
|
|
|
|
|
+ data_res['extract_count'] = extractCount(data_res,page_attachments)
|
|
# 是否有表格
|
|
# 是否有表格
|
|
data_res['exist_table'] = 1 if re.search("<td",text) else 0
|
|
data_res['exist_table'] = 1 if re.search("<td",text) else 0
|
|
data_res["cost_time"] = cost_time
|
|
data_res["cost_time"] = cost_time
|