@@ -43,7 +43,31 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
-def extractCount(extract_dict,page_attachments):
+def get_login_web_set():
+
+    file = os.path.join(os.path.dirname(__file__),"login_weblist.txt")
+    list_web = []
+    try:
+        if os.path.exists(file):
+            with open(file,"r",encoding="utf8") as f:
+                while 1:
+                    line = f.readline()
+                    if not line:
+                        break
+                    line = line.strip()
+                    if line:
+                        list_web.append(line)
+
+    except Exception as e:
+        traceback.print_exc()
+    _set = set(list_web)
+    log("get_login_web_set length %d"%(len(_set)))
+    return _set
+
+set_login_web = get_login_web_set()
+
+
+def extractCount(extract_dict,page_attachments,web_source_name):
     # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
 
     if len(extract_dict):
@@ -134,6 +158,8 @@ def extractCount(extract_dict,page_attachments):
         traceback.print_exc()
         pass
 
+    if web_source_name in set_login_web:
+        extract_count -= 1
 
     return extract_count
 
@@ -405,7 +431,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res["dict_enterprise"] = dict_enterprise
 
     # number of extracted elements
-    data_res['extract_count'] = extractCount(data_res,page_attachments)
+    data_res['extract_count'] = extractCount(data_res,page_attachments,web_source_name)
     # whether a table exists
     data_res['exist_table'] = 1 if re.search("<td",text) else 0
     data_res["cost_time"] = cost_time
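
Taken together, the hunks above read a newline-delimited login_weblist.txt (one web-source name per line) into a module-level set at import time, and extractCount() now subtracts one from the element count when the document's web_source_name appears in that set, so sources that sit behind a login wall score slightly lower. A minimal standalone sketch of the same pattern follows; the file name and the one-point penalty come from the diff, while the penalize_login_sources helper and the standalone import scaffolding are illustrative assumptions, not part of the patched module:

import os
import traceback

def get_login_web_set(path="login_weblist.txt"):
    # One web-source name per line; a missing or unreadable file yields an empty set.
    names = []
    try:
        if os.path.exists(path):
            with open(path, "r", encoding="utf8") as f:
                for line in f:  # same behavior as the diff's while 1 / readline() loop
                    line = line.strip()
                    if line:
                        names.append(line)
    except Exception:
        traceback.print_exc()
    return set(names)

# Built once at import time, mirroring the module-level set_login_web in the diff.
set_login_web = get_login_web_set()

def penalize_login_sources(extract_count, web_source_name):
    # Hypothetical helper: login-walled sources score one point lower.
    if web_source_name in set_login_web:
        extract_count -= 1
    return extract_count

Iterating with "for line in f" reads the file line by line exactly as the diff's explicit readline() loop does, just in the more idiomatic form.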