Преглед изворни кода

要素提取补充AI提取

luojiehua пре 2 месеци
родитељ
комит
d7eb966b17

+ 2 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -1273,6 +1273,8 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
 
     def should_to_extract_ai(self,extract_json):
+
+        return False
         _extract = {}
         if extract_json is not None:
             try:

+ 70 - 1
BaseDataMaintenance/model/ots/document_tmp.py

@@ -440,9 +440,78 @@ def export_extract_ai():
     df = pd.DataFrame(list_data)
     df.to_excel("%s.xlsx"%(filename))
 
+def export_extract_ai1():
+    filename = r"C:\Users\Administrator\Desktop\extract_ai.txt"
+    list_docid = []
+    from BaseDataMaintenance.dataSource.source import getConnect_ots
+    from BaseDataMaintenance.model.ots.document import Document
+    ots_client = getConnect_ots()
+    with open(filename,"r",encoding="utf8") as f:
+        while 1:
+            line = f.readline()
+            if not line:
+                break
+            line = line.strip()
+            if line!="":
+                try:
+                    docid = line.split(":")[-1]
+                    print(docid)
+                    list_docid.append(int(docid))
+                except Exception as e:
+                    pass
+    list_docid.reverse()
+    import pandas as pd
+    list_data = []
+    _count = 0
+    for docid in list_docid:
+        _count += 1
+        print("%d/%d"%(_count,len(list_docid)))
+        partitionkey = docid%500+1
+        _d = {document_tmp_partitionkey:partitionkey,
+              document_tmp_docid:docid}
+        dtmp = Document(_d)
+        if dtmp.fix_columns(ots_client,["extract_json","docchannel"],True):
+            docchannel = dtmp.getProperties().get("docchannel",0)
+            extract_json = dtmp.getProperties().get("extract_json")
+            _extract = json.loads(extract_json)
+            if docchannel in (52,101,118,119,120,121,122):
+                docchannel_dict = {52:"招标公告",
+                                   101:"中标公告",
+                                   118:"废标公告",
+                                   119:"候选人公示",
+                                   120:"合同公告",
+                                   121:"开标记录",
+                                   122:"验收合同"}
+                changed_tenderee = ""
+                changed_win_tenderer = ""
+                changed_win_price = ""
+                prem = _extract.get("prem",{})
+                for pack,pack_value in prem.items():
+                    rolelist = pack_value.get("roleList",[])
+                    for _role in rolelist:
+                        if _role.get("address") is None:
+                            if _role.get("role_name","")=="tenderee":
+                                changed_tenderee = _role.get("role_text","")
+                            if _role.get("role_name","")=="win_tenderer":
+                                changed_win_tenderer = _role.get("role_text","")
+                                changed_win_price = _role.get("role_money",{}).get("money")
+                if changed_tenderee!="" or changed_win_tenderer!="" or changed_win_price!="":
+                    data_d = {
+                        "docid":docid,
+                        "公告类型":docchannel_dict.get(docchannel,""),
+                        "招标人":changed_tenderee,
+                        "中标人":changed_win_tenderer,
+                        "中标金额":changed_win_price,
+                    }
+                    list_data.append(data_d)
+                    # if len(list_data)>=200:
+                    #     break
+    df = pd.DataFrame(list_data)
+    df.to_excel("%s.xlsx"%(filename))
 
 
 if __name__=="__main__":
     # turn_extract_status()
     # turn_document_tmp_status()
-    export_extract_ai()
+    # export_extract_ai()
+    export_extract_ai1()