|
@@ -43,7 +43,31 @@ class MyEncoder(json.JSONEncoder):
|
|
return obj
|
|
return obj
|
|
return json.JSONEncoder.default(self, obj)
|
|
return json.JSONEncoder.default(self, obj)
|
|
|
|
|
|
-def extractCount(extract_dict):
|
|
|
|
|
|
def get_login_web_set():
    """Load the set of web-source names that require a login.

    Reads one name per line from ``login_weblist.txt`` located next to this
    module, skipping blank lines. Best-effort: if the file is missing the
    result is an empty set, and any read error is logged (via traceback)
    rather than raised, so module import never fails on a bad list file.

    Returns:
        set[str]: the distinct, stripped, non-empty lines of the file.
    """
    file = os.path.join(os.path.dirname(__file__), "login_weblist.txt")
    list_web = []
    try:
        if os.path.exists(file):
            with open(file, "r", encoding="utf8") as f:
                # Iterate the file object directly instead of the original
                # `while 1: readline(); if not line: break` manual loop —
                # same lines, same order, idiomatic and lazy.
                for line in f:
                    line = line.strip()
                    if line:
                        list_web.append(line)
    except Exception as e:
        # Deliberate best-effort: log and continue with whatever was read.
        traceback.print_exc()
    _set = set(list_web)
    log("get_login_web_set length %d"%(len(_set)))
    return _set

# Module-level cache: computed once at import time and consulted by
# extractCount() to penalize sources that sit behind a login wall.
set_login_web = get_login_web_set()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+def extractCount(extract_dict,page_attachments,web_source_name):
|
|
# time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
|
|
# time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
|
|
|
|
|
|
if len(extract_dict):
|
|
if len(extract_dict):
|
|
@@ -113,6 +137,42 @@ def extractCount(extract_dict):
|
|
extract_count += 1
|
|
extract_count += 1
|
|
if project_name!="":
|
|
if project_name!="":
|
|
extract_count += 1
|
|
extract_count += 1
|
|
|
|
+
|
|
|
|
+ if page_attachments is not None and page_attachments!='':
|
|
|
|
+ try:
|
|
|
|
+ _attachments = json.loads(page_attachments)
|
|
|
|
+ has_zhaobiao = False
|
|
|
|
+ has_qingdan = False
|
|
|
|
+ if len(_attachments)>0:
|
|
|
|
+ for _atta in _attachments:
|
|
|
|
+ classification = _atta.get("classification","")
|
|
|
|
+ if str(classification)=='招标文件':
|
|
|
|
+ has_zhaobiao = True
|
|
|
|
+ if str(classification)=='采购清单':
|
|
|
|
+ has_qingdan = True
|
|
|
|
+ if has_zhaobiao:
|
|
|
|
+ extract_count += 3
|
|
|
|
+ if has_qingdan:
|
|
|
|
+ extract_count += 2
|
|
|
|
+ except Exception as e:
|
|
|
|
+ traceback.print_exc()
|
|
|
|
+ pass
|
|
|
|
+
|
|
|
|
+ list_approval_dict = _extract.get("approval",[])
|
|
|
|
+ for _dict in list_approval_dict:
|
|
|
|
+ for k,v in _dict.items():
|
|
|
|
+ if v is not None and v!='' and v!="未知":
|
|
|
|
+ extract_count += 1
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ punish_dict = _extract.get("punish",{})
|
|
|
|
+ for k,v in punish_dict.items():
|
|
|
|
+ if v is not None and v!='' and v!="未知":
|
|
|
|
+ extract_count += 1
|
|
|
|
+
|
|
|
|
+ if web_source_name in set_login_web:
|
|
|
|
+ extract_count -= 1
|
|
|
|
+
|
|
return extract_count
|
|
return extract_count
|
|
|
|
|
|
# 字符编码标准化
|
|
# 字符编码标准化
|
|
@@ -176,7 +236,7 @@ def repair_entity(prem,district_dict,list_articles):
|
|
role['role_text'] = city + role_text
|
|
role['role_text'] = city + role_text
|
|
|
|
|
|
|
|
|
|
-def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
|
|
|
|
|
|
+def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
cost_time = dict()
|
|
cost_time = dict()
|
|
|
|
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
@@ -391,12 +451,16 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
data_res["dict_enterprise"] = dict_enterprise
|
|
data_res["dict_enterprise"] = dict_enterprise
|
|
|
|
|
|
# 要素的个数
|
|
# 要素的个数
|
|
- data_res['extract_count'] = extractCount(data_res)
|
|
|
|
|
|
+ data_res['extract_count'] = extractCount(data_res,page_attachments,web_source_name)
|
|
# 是否有表格
|
|
# 是否有表格
|
|
data_res['exist_table'] = 1 if re.search("<td",text) else 0
|
|
data_res['exist_table'] = 1 if re.search("<td",text) else 0
|
|
data_res["cost_time"] = cost_time
|
|
data_res["cost_time"] = cost_time
|
|
data_res["success"] = True
|
|
data_res["success"] = True
|
|
|
|
|
|
|
|
+ # 拟在建需建索引字段
|
|
|
|
+ data_res["proportion"] = pb_json.get('pb').get('proportion', '')
|
|
|
|
+ data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')
|
|
|
|
+
|
|
# for _article in list_articles:
|
|
# for _article in list_articles:
|
|
# log(_article.content)
|
|
# log(_article.content)
|
|
#
|
|
#
|