Browse Source

要素提取补充AI提取,项目表增加docchannel

luojiehua 2 tháng trước
mục cha
commit
3ddfb38873

+ 3 - 2
BaseDataMaintenance/AIUtils/DoubaoUtils.py

@@ -11,8 +11,9 @@ client = Ark(
 )
 
 model_name = "ep-20250212111145-fflr7" #1.5pro 256k
-# model_name = "ep-20241226161807-95wp8" #128k
-# model_name = "ep-20241225142604-lv25x" #32k
+model_name = "ep-20250314164242-jd62g" #1.5pro 32k
+# model_name = "ep-20241226161807-95wp8" #lite 128k
+# model_name = "ep-20241225142604-lv25x" #lite 32k
 
 def chat_doubao(msg,stream=False,retry_time=3,model_name=model_name):
 

+ 4 - 2
BaseDataMaintenance/AIUtils/prompts.py

@@ -2,7 +2,9 @@
 
 
 
-def get_prompt_extract_role(_text):
+def get_prompt_extract_role(_text,max_length=30000):
+    if len(_text)>max_length:
+        _text = _text[:max_length-10000]+_text[-10000:]
     _prompt = '''
     #公告内容开始:
     -------------------------------
@@ -27,6 +29,6 @@ def get_prompt_extract_role(_text):
     8. 项目预算和中标金额返回金额+单位的格式
     #返回结果
     结果返回json格式{"招标信息":{"招标人名称":"","项目预算":"","招标人联系方式":[{"联系人":"","联系电话":""}]},"中标信息":[{"中标人名称":"","中标金额":"":"标段号":""}]}
-    '''%(_text)
+    '''%(_text[:max_length])
     return _prompt
 

+ 29 - 4
BaseDataMaintenance/maintenance/2.py

@@ -1,9 +1,34 @@
 #coding:utf8
 
-import re
+import pandas as pd
+
+filename = r"C:\Users\Administrator\Desktop\extract_ai.txt.xlsx"
 
-a = '19028,919.1万元'
+df = pd.read_excel(filename)
+
+import re
 
-b = re.search(r'[\d,,\.]+[亿万元人民币]+',a)
+def clean_ai_entity(entity):
+    if isinstance(entity,str):
+        if re.search("(未|无)(明确|提及)|某(部|单位|医院|公司)|\*\*|XX|登录|详见|招标单位",entity) is not None:
+            print(entity)
+            return ""
+        if re.search("无|区|县|市|省|某|中心|部|公司",entity) is not None and len(entity)<=5:
+            print(entity)
+            return ""
+        return entity
+    return ""
 
-print(b.group())
+list_data = []
+for i in range(len(df)):
+    docid = df.iloc[i]["docid"]
+    tenderee = df.iloc[i]["招标人"]
+    win_tenderer = df.iloc[i]["中标人"]
+    win_price = df.iloc[i]["中标金额"]
+    new_dict = {"docid":docid,
+                "tenderee":clean_ai_entity(tenderee),
+                "win_tenderer":clean_ai_entity(win_tenderer),
+                "win_price":win_price}
+    list_data.append(new_dict)
+df1 = pd.DataFrame(list_data)
+df1.to_excel("%s.xlsx"%(filename))

+ 11 - 5
BaseDataMaintenance/maintenance/dataflow.py

@@ -3080,16 +3080,19 @@ class Dataflow_dumplicate(Dataflow):
                     continue
             if v is None or v=="" or v=="[]" or v=="未知":
                 continue
-            if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates,project_zhong_biao_page_time,project_zhao_biao_page_time):
+            if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates,project_zhong_biao_page_time,project_zhao_biao_page_time,project_page_time,project_docchannel):
                 continue
             _dict[k] = v
 
-
         for _proj in projects:
             _proj.update(_dict)
         for _proj in projects:
-            if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
+            if _proj.get(project_page_time,"")<=project_dict.get(project_page_time,""):
                 _proj[project_page_time] = project_dict.get(project_page_time,"")
+                _proj[project_docchannel] = project_dict.get(project_docchannel,"")
+            else:
+                if project_docchannel in project_dict:
+                    project_dict.pop(project_docchannel)
             if _proj.get(project_zhong_biao_page_time,"")>project_dict.get(project_zhong_biao_page_time,""):
                 _proj[project_zhong_biao_page_time] = project_dict.get(project_zhong_biao_page_time,"")
             if _proj.get(project_zhao_biao_page_time,"")>project_dict.get(project_zhao_biao_page_time,""):
@@ -3621,7 +3624,8 @@ class Dataflow_dumplicate(Dataflow):
             project_nlp_enterprise_attachment,
             project_tenderee_code,
             project_agency_code,
-            project_candidates
+            project_candidates,
+            project_docchannel
         ],sort="page_time",table_name="project2",table_index="project2_index")
 
         return list_project_dict
@@ -4074,6 +4078,7 @@ class Dataflow_dumplicate(Dataflow):
             _time = time.time()
             list_projects = self.search_projects_with_document(list_docids)
             log("search %d projects takes:%.3f"%(len(list_projects),time.time()-_time))
+
             if len(list_projects)==0:
                 # _time = time.time()
                 list_docs = self.search_docs(list_docids)
@@ -4087,6 +4092,7 @@ class Dataflow_dumplicate(Dataflow):
                 # log("update projects takes:%.3f"%(time.time()-_time))
             _time = time.time()
             list_projects = dumplicate_projects(list_projects)
+
             # log("dumplicate projects takes:%.3f"%(time.time()-_time))
             _time = time.time()
             list_projects = self.merge_projects(list_projects,b_log)
@@ -4896,7 +4902,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(536342520
+    df_dump.test_dumplicate(596183012
                             )
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061

+ 13 - 9
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -1283,21 +1283,12 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 pass
 
 
-
         has_entity = False
-        has_tenderee = False
-        has_win_tenderer = False
 
         docchannel = _extract.get("docchannel",{}).get("docchannel","")
         if len(_extract.get("dict_enterprise",{}).keys())>0:
             has_entity = True
         prem = _extract.get("prem",{})
-        for k,v in prem.items():
-            for role in v.get("roleList",[]):
-                if role.get("role_name")=="tenderee":
-                    has_tenderee = True
-                if role.get("role_name")=="win_tenderer":
-                    has_win_tenderer = True
 
         has_tenderee = False
         has_win_tenderer = False
@@ -1426,6 +1417,17 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             b = re.search(r'[\d,,\.]+[亿万元人民币]+',_text)
             if b is not None:
                 return b.group()
+
+        def clean_ai_entity(entity):
+            if isinstance(entity,str):
+                if re.search("(未|无)(明确|提及)|某(部|单位|医院|公司)|\*\*|XX|登录|详见|招标单位",entity) is not None:
+                    print(entity)
+                    return ""
+                if re.search("无|区|县|市|省|某|中心|部|公司",entity) is not None and len(entity)<=5:
+                    print(entity)
+                    return ""
+                return entity
+            return ""
         _extract = {}
         if extract_json is not None:
             try:
@@ -1492,6 +1494,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
         _changed = False
         if not has_tenderee:
             _tenderee_ai = _extract_ai.get("招标信息",{}).get("招标人名称")
+            _tenderee_ai = clean_ai_entity(_tenderee_ai)
             _contacts = _extract_ai.get("招标信息",{}).get("招标人联系方式",[])
             _linklist = []
             for _conta in _contacts:
@@ -1547,6 +1550,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                     else:
                         _win_money = 0
                     _win_tenderer = _win_dict.get("中标人名称")
+                    _win_tenderer = clean_ai_entity(_win_tenderer)
                     if _win_tenderer!="" and len(_win_tenderer)>=4:
                         _role_dict = {
                             "role_name": "win_tenderer",

+ 11 - 2
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -20,6 +20,7 @@ project_docids = "docids"
 project_zhao_biao_page_time = "zhao_biao_page_time"
 project_zhong_biao_page_time = "zhong_biao_page_time"
 project_page_time = "page_time"
+project_docchannel = "docchannel"
 project_doctextcon = "doctextcon"
 project_area = "area"
 project_province = "province"
@@ -1628,6 +1629,7 @@ def generate_common_properties(list_docs):
 
     list_product = []
     p_page_time = ""
+    p_docchanel = 0
     remove_docids = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
@@ -1639,7 +1641,7 @@ def generate_common_properties(list_docs):
         status = _doc.get(document_status,0)
         _save = _doc.get(document_tmp_save,1)
         doctitle = _doc.get(document_doctitle,"")
-        docchannel = _doc.get(document_docchannel)
+        docchannel = _doc.get(document_docchannel,0)
         page_time = _doc.get(document_page_time,"")
         _docid = _doc.get(document_docid)
         _bidway = _doc.get(document_bidway,"")
@@ -1683,6 +1685,11 @@ def generate_common_properties(list_docs):
 
         if p_page_time=="":
             p_page_time = page_time
+            p_docchanel = docchannel
+        else:
+            if p_page_time<page_time:
+               p_page_time = page_time
+               p_docchanel = docchannel
 
         if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
             zhao_biao_page_time = page_time
@@ -1728,6 +1735,7 @@ def generate_common_properties(list_docs):
         project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
     project_dict[project_project_codes] = ",".join(list(set(list_codes)))
     project_dict[project_page_time] = p_page_time
+    project_dict[project_docchannel] = p_docchanel
     project_dict[project_product] = ",".join(list(set(list_product)))
     project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
@@ -2247,8 +2255,9 @@ def update_projects_by_project(project_dict,projects):
 
     for _proj in projects:
         _proj.update(_dict)
-        if str(_proj.get(project_page_time,""))<str(project_dict.get(project_page_time,"")):
+        if str(_proj.get(project_page_time,""))<=str(project_dict.get(project_page_time,"")):
             _proj[project_page_time] = project_dict.get(project_page_time,"")
+            _proj[project_docchannel] = project_dict.get(project_docchannel,"")
         if project_dict.get(project_sub_project_name) is not None and project_dict.get(project_sub_project_name) not in {"","Project"}:
             if not (_proj.get(project_sub_project_name) is not None and _proj.get(project_sub_project_name) not in {"","Project"}):
                 _proj[project_sub_project_name] = project_dict.get(project_sub_project_name)

+ 1 - 0
BaseDataMaintenance/model/ots/project.py

@@ -5,6 +5,7 @@ project_docids = "docids"
 project_zhao_biao_page_time = "zhao_biao_page_time"
 project_zhong_biao_page_time = "zhong_biao_page_time"
 project_page_time = "page_time"
+project_docchannel = "docchannel"
 project_doctextcon = "doctextcon"
 project_doctitles = "doctitles"
 project_area = "area"