
extract_count: add the impact of attachments

luojiehua, 11 months ago
Parent
Commit
0046258077
3 changed files with 27 additions and 5 deletions
  1. .idea/misc.xml (+1, -1)
  2. BiddingKG/dl/interface/extract.py (+24, -3)
  3. BiddingKG/run_extract_server.py (+2, -1)

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.5 (BiddingKG)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_15" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 24 - 3
BiddingKG/dl/interface/extract.py

@@ -43,7 +43,7 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
-def extractCount(extract_dict):
+def extractCount(extract_dict,page_attachments):
     # time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
 
     if len(extract_dict):
@@ -113,6 +113,27 @@ def extractCount(extract_dict):
         extract_count += 1
     if project_name!="":
         extract_count += 1
+
+    if page_attachments is not None and page_attachments!='':
+        try:
+            _attachments = json.loads(page_attachments)
+            has_zhaobiao = False
+            has_qingdan = False
+            if len(_attachments)>0:
+                for _atta in _attachments:
+                    classification = _atta.get("classification","")
+                    if str(classification)=='招标文件':
+                        has_zhaobiao = True
+                    if str(classification)=='采购清单':
+                        has_qingdan = True
+            if has_zhaobiao:
+                extract_count += 3
+            if has_qingdan:
+                extract_count += 2
+        except Exception as e:
+            pass
+
+
     return extract_count
 
 # 字符编码标准化
@@ -176,7 +197,7 @@ def repair_entity(prem,district_dict,list_articles):
                             role['role_text'] = city + role_text
 
 
-def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
+def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
@@ -352,7 +373,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment
     # 要素的个数
-    data_res['extract_count'] = extractCount(data_res)
+    data_res['extract_count'] = extractCount(data_res,page_attachments)
     # 是否有表格
     data_res['exist_table'] = 1 if re.search("<td",text) else 0
     data_res["cost_time"] = cost_time

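For reference, a minimal sketch of what the new attachment bonus does: attachments whose classification is 招标文件 (bidding document) add 3 to extract_count, and 采购清单 (procurement list) adds 2. The helper name attachment_bonus and the fileMd5 key below are illustrative only; in the actual code the bonus is computed inside extractCount together with the other extracted fields:

import json

# Illustrative re-implementation of the attachment bonus added above.
# Assumption: page_attachments is a JSON string holding a list of objects
# that carry a "classification" field, as passed in from run_extract_server.py.
def attachment_bonus(page_attachments):
    bonus = 0
    try:
        attachments = json.loads(page_attachments) if page_attachments else []
        classifications = {str(a.get("classification", "")) for a in attachments}
        if "招标文件" in classifications:   # bidding document present
            bonus += 3
        if "采购清单" in classifications:   # procurement list present
            bonus += 2
    except Exception:
        # malformed JSON contributes nothing, mirroring the try/except above
        pass
    return bonus

# Hypothetical payload; only "classification" is read, "fileMd5" is made up.
attachments = json.dumps(
    [{"fileMd5": "abc123", "classification": "招标文件"},
     {"fileMd5": "def456", "classification": "采购清单"}],
    ensure_ascii=False,
)
print(attachment_bonus(attachments))   # -> 5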
+ 2 - 1
BiddingKG/run_extract_server.py

@@ -81,11 +81,12 @@ def run_thread(data,list_result):
     web_source_no = data.get("web_source_no","")
     web_source_name = data.get("web_source_name","")
     original_docchannel = data.get("original_docchannel","")
+    page_attachments = data.get("page_attachments","")
     # print("web_source_name:",web_source_name)
     is_fail = False
     try:
         if _content!="":
-            data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no=web_source_no,web_source_name=web_source_name,original_docchannel=original_docchannel)
+            data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no=web_source_no,web_source_name=web_source_name,original_docchannel=original_docchannel,page_attachments=page_attachments)
         else:
             data_res = json.dumps({"success":False,"msg":"content not passed"})
             # is_fail = True
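To exercise the new parameter end to end, a request body only needs the extra page_attachments key. A hedged example follows; the key names for doc_id, content, title, and page_time are not visible in this hunk and are assumed, while the remaining keys match the data.get calls above:

import json

# Hypothetical request body for run_extract_server.py. page_attachments is a
# JSON *string*, matching how run_thread passes it straight through to predict().
payload = {
    "doc_id": "10001",                      # assumed key name
    "content": "<div>项目招标公告正文<td>预算金额</td></div>",
    "title": "某项目招标公告",               # assumed key name
    "page_time": "2023-05-01",              # assumed key name
    "web_source_no": "",
    "web_source_name": "",
    "original_docchannel": "",
    "page_attachments": json.dumps(
        [{"classification": "招标文件"}], ensure_ascii=False
    ),
}
print(json.dumps(payload, ensure_ascii=False, indent=2))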