Bläddra i källkod

新增变更答疑公告 变更内容提取

znj 2 dagar sedan
förälder
incheckning
a5188cd88f
2 ändrade filer med 27 tillägg och 3 borttagningar
  1. 4 2
      BiddingKG/dl/interface/extract.py
  2. 23 1
      BiddingKG/dl/interface/outline_extractor.py

+ 4 - 2
BiddingKG/dl/interface/extract.py

@@ -318,11 +318,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     start_time = time.time()
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope, correction_content = extract_parameters(parse_document)
 
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope = extract_parameters(parse_document)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope, correction_content = extract_parameters(parse_document)
     # print('out_lines',out_lines)
     # if addr_bidopen_text == '':
     #     addr_bidopen_text = extract_addr(list_articles[0].content)
@@ -598,6 +598,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res["proportion"] = pb_json.get('pb').get('proportion', '')
     data_res["pb_project_name"] = pb_json.get('pb').get('project_name_refind', '')
 
+    # 更正内容
+    data_res['correction_content'] = correction_content[:1500]
     # 资质要求
     data_res['aptitude'] = aptitude_text[:1500]
     # 采购内容

+ 23 - 1
BiddingKG/dl/interface/outline_extractor.py

@@ -67,6 +67,8 @@ pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
 
+correction_pattern = "(更正|更改|修正|修改|变更|延期)(信息|内容|事项|详情)"
+
 def extract_parameters(parse_document):
     '''
     通过大纲、预处理后文本正则获取需要字段
@@ -82,6 +84,7 @@ def extract_parameters(parse_document):
     winter_scope = [] # 中标信息始末位置
     pinmu_name = '' # 品目名称
     list_policy = [] # 政策法规
+    correction_content = "" # 更正内容
     out_lines = []
 
     _find_count = 0
@@ -132,6 +135,25 @@ def extract_parameters(parse_document):
                     _data_i += len(childs)
                     _data_i -= 1
 
+    _data_i = -1
+    # 更正内容
+    while _data_i < len(list_data) - 1:
+        _data_i += 1
+        _data = list_data[_data_i]
+        _type = _data["type"]
+        _text = _data["text"].strip()
+        if _type == "sentence":
+            if _data["sentence_title"] is not None:
+                if re.search(correction_pattern, _text[:20]) is not None:
+                    childs = get_childs([_data])
+                    correction_text = ""
+                    for c in childs:
+                        correction_text += c["text"].strip()
+                    # print('correction_text',correction_text)
+                    correction_content += correction_text
+                    _data_i += len(childs)
+                    _data_i -= 1
+
     _data_i = -1
     while _data_i<len(list_data)-1:
         _data_i += 1
@@ -221,7 +243,7 @@ def extract_parameters(parse_document):
         pinmu_name = pinmu_name[ser.end():]
         if re.search('[^\w]$', pinmu_name):
             pinmu_name = pinmu_name[:-1]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy, winter_scope,correction_content
 
 def extract_addr(content):
     '''