瀏覽代碼

补充上传

luojiehua 4 月之前
父節點
當前提交
92ca7909f6

+ 8 - 1
.idea/dataSources.xml

@@ -12,7 +12,7 @@
       <synchronize>true</synchronize>
       <auto-commit>false</auto-commit>
       <jdbc-driver>oracle.jdbc.OracleDriver</jdbc-driver>
-      <jdbc-url>jdbc:oracle:thin:@121.46.18.113:10522:yanphone</jdbc-url>
+      <jdbc-url>jdbc:oracle:thin:@dianxin.bidizhaobiao.com:10522:yanphone</jdbc-url>
     </data-source>
     <data-source source="LOCAL" name="mysql_测试" uuid="19d8fc36-28e0-4de8-bed4-c356c7fd53cb">
       <driver-ref>mysql.8</driver-ref>
@@ -66,5 +66,12 @@
       <jdbc-url>jdbc:postgresql://116.62.141.83:5432/postgres</jdbc-url>
       <working-dir>$ProjectFileDir$</working-dir>
     </data-source>
+    <data-source source="LOCAL" name="mysql@localhost" uuid="8312b206-e701-448e-b945-8e554498c4d2">
+      <driver-ref>mysql.8</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
+      <jdbc-url>jdbc:mysql://localhost:3306/mysql</jdbc-url>
+      <working-dir>$ProjectFileDir$</working-dir>
+    </data-source>
   </component>
 </project>

+ 1 - 0
.idea/encodings.xml

@@ -11,6 +11,7 @@
     <file url="file://$PROJECT_DIR$/data/exportFind_tenderee1.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/data/服务型客户.txt" charset="GBK" />
     <file url="file://$PROJECT_DIR$/dataSource/ossUtils.py" charset="UTF-8" />
+    <file url="file://$PROJECT_DIR$/export/html2text.py" charset="UTF-8" />
     <file url="file://$PROJECT_DIR$/test/industry_keyword_expand.py" charset="UTF-8" />
     <file url="file://$PROJECT_DIR$/test/拓展关键词.xlsx" charset="UTF-8" />
     <file url="file://$PROJECT_DIR$/utils/ERNIE_utils.py" charset="UTF-8" />

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_19" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
     <output url="file://$PROJECT_DIR$/out" />
   </component>
 </project>

+ 0 - 1
.idea/sqldialects.xml

@@ -3,6 +3,5 @@
   <component name="SqlDialectMappings">
     <file url="file://$PROJECT_DIR$/ddl" dialect="MySQL" />
     <file url="file://$PROJECT_DIR$/ddl_postgres" dialect="PostgreSQL" />
-    <file url="file://G:/要素提取标注备份/iepy_public_auth_group.sql" dialect="PostgreSQL" />
   </component>
 </project>

+ 1 - 1
DataMining.iml

@@ -3,7 +3,7 @@
   <component name="NewModuleRootManager" inherit-compiler-output="true">
     <exclude-output />
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.5 (dl_nlp)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.7 (py37)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
 </module>

文件差異過大導致無法顯示
+ 121 - 0
export/DoubaoUtils.py


+ 31 - 0
export/cleanAgency.py

@@ -0,0 +1,31 @@
+#coding:utf8
+
+import pandas as pd
+import re
+
+def clean():
+    """Filter likely agency names out of the source workbook and export them.
+
+    Reads ``20230227都大于100.xlsx``, keeps every row whose ``dai_li_number``
+    is above 10000 and whose ``nicknames`` text contains "招标" or "咨询",
+    prints each kept row with a running counter and writes the kept rows to
+    ``daili_check.xlsx``.
+    """
+    filename = "20230227都大于100.xlsx"
+    df = pd.read_excel(filename)
+
+    # Compile once instead of resolving the pattern again for every row.
+    agency_pattern = re.compile("招标|咨询")
+
+    _count = 0
+    df_data = {"name":[],
+               "zbn":[],
+               "dln":[],
+               "sn":[]}
+    for name,zbn,dln,sn in zip(df["nicknames"],df["zhao_biao_number"],df["dai_li_number"],df["same_number"]):
+        # pandas loads blank cells as float NaN; passing one to the regex
+        # raises TypeError, so rows without a text name are skipped.
+        if not isinstance(name,str):
+            continue
+        if dln>10000 and agency_pattern.search(name) is not None:
+            _count += 1
+            print(_count,name,zbn,dln,sn)
+            df_data["name"].append(name)
+            df_data["zbn"].append(zbn)
+            df_data["dln"].append(dln)
+            df_data["sn"].append(sn)
+    df = pd.DataFrame(df_data)
+    df.to_excel("daili_check.xlsx")
+
+
+
+
+
+if __name__ == '__main__':
+    clean()

二進制
export/docchannel.pk


文件差異過大導致無法顯示
+ 1027 - 124
export/exportDocument.py


+ 31 - 20
export/exportEnterprise.py

@@ -1052,12 +1052,22 @@ def attachColumn1():
     df.to_excel("全国剩下数据16570-1(2)11.xlsx")
 
 def exportContact():
-    filename = "../data/2023-03-06_190109_to_excel.xlsx"
-    df = pd.read_excel(filename)
-    list_ename = df["_id"]
+    filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-09\企业名称名单.txt"
+
+    list_name = []
+    with open(filename,"r",encoding="utf8") as f:
+        while 1:
+            line = f.readline()
+            if not line:
+                break
+            line = line.strip()
+            list_name.append(line)
+
+    # df = pd.read_excel(filename)
+    # list_name = df["name"]
 
     list_dict = []
-    for _en in list_ename:
+    for _en in list_name:
         if isinstance(_en,(str)) and _en!="":
             _dict = {"enterprise_name":_en}
             list_dict.append(_dict)
@@ -1072,28 +1082,28 @@ def exportContact():
 
         rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise","enterprise_index",
                                                                        SearchQuery(bool_query,limit=1),
-                                                                       columns_to_get=ColumnsToGet(["reg_location"],return_type=ColumnReturnType.SPECIFIED))
+                                                                       columns_to_get=ColumnsToGet(["tou_biao_number","zhong_biao_number"],return_type=ColumnReturnType.SPECIFIED))
         l_data = getRow_ots(rows)
         if len(l_data)>0:
             _d.update(l_data[0])
 
-        bool_query = BoolQuery(must_queries=[TermQuery("enterprise_name",_name),
-                                             BoolQuery(should_queries=[TermQuery("is_legal_person",1),
-                                                                       TermQuery("is_mobile",1)])])
-
-        rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
-                                                                       SearchQuery(bool_query,limit=5),
-                                                                       columns_to_get=ColumnsToGet(["enterprise_name","contact_person","phone_no","position"],return_type=ColumnReturnType.SPECIFIED))
-        l_data = getRow_ots(rows)
-        if len(l_data)>0:
-            _d.update(l_data[0])
+        # bool_query = BoolQuery(must_queries=[TermQuery("enterprise_name",_name),
+        #                                      BoolQuery(should_queries=[TermQuery("is_legal_person",1),
+        #                                                                TermQuery("is_mobile",1)])])
+        #
+        # rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
+        #                                                                SearchQuery(bool_query,limit=5),
+        #                                                                columns_to_get=ColumnsToGet(["enterprise_name","contact_person","phone_no","position"],return_type=ColumnReturnType.SPECIFIED))
+        # l_data = getRow_ots(rows)
+        # if len(l_data)>0:
+        #     _d.update(l_data[0])
 
     mt = MultiThreadHandler(task_queue,_handle,None,60)
     mt.run()
     df_data= {}
-    columns = ["name","contact_person","phone_no","reg_location"]
+    columns = ["name","tou_biao_number","zhong_biao_number"]
     for _d in list_dict:
-        if "phone_no" in _d:
+        if _d.get("tou_biao_number",0)>0 or _d.get("zhong_biao_number",0)>0:
             for c in columns:
                 if c not in df_data:
                     df_data[c] = []
@@ -1101,7 +1111,7 @@ def exportContact():
 
 
     df = pd.DataFrame(df_data)
-    df.to_excel("../data/%s_export_enterprise.xlsx"%(getCurrent_date(format="%Y-%m-%d_%H%M%S")),encoding="utf",columns=columns)
+    df.to_csv("../data/%s_export_enterprise.csv"%(getCurrent_date(format="%Y-%m-%d_%H%M%S")),encoding="utf",columns=columns)
 
 def getTycCompany():
 
@@ -1179,15 +1189,16 @@ def getTycCompany():
 
 if __name__=="__main__":
     # getTyc_company()
-    getTycCompany()
+    # getTycCompany()
     # exportEnterprise_by_bidNum()
     # print(getDictEnterprise(["南宁宏基建筑工程有限责任公司"],["phone"]))
     # exportEnterprise_by_phone()
     # make_Legal_enterprise()
     # transform_enterprise()
     # exportEnterprise()
-    # exportContact()
+    exportContact()
     # attachColumn()
+
     # attachColumn()
 
     # ots_client = getConnect_ots()

+ 118 - 1
export/exportEs.py

@@ -645,5 +645,122 @@ if __name__ == '__main__':
         },
         "_source":["contacts_enterprise_name","contacts_person_name","contacts_phone_no"]
     }
+
+    body = {
+        "_source": "_name",
+        'query': {  # 查询命令
+            "bool": {
+                "must":[
+                    # {"has_child":{
+                    #     "type":"contacts",
+                    #     "query":{
+                    #         "bool":{
+                    #             'must': [
+                    #                 {'match_phrase':
+                    #                      {'nicknames':'医院'
+                    #                       # {
+                    #                       # "query": "医院",  # >= 大于等于
+                    #                       # # # "lt": 1650038400000  # < 小于
+                    #                       # }
+                    #                       },
+                    #                  },
+                    #                 {'term':
+                    #                      {'contacts_is_legal_person':0
+                    #                       # {
+                    #                       # "query": "医院",  # >= 大于等于
+                    #                       # # # "lt": 1650038400000  # < 小于
+                    #                       # }
+                    #                       },
+                    #                  },{'term':
+                    #                         {'contacts_is_mobile':1
+                    #                          # {
+                    #                          # "query": "医院",  # >= 大于等于
+                    #                          # # # "lt": 1650038400000  # < 小于
+                    #                          # }
+                    #                          },
+                    #                     }
+                    #             ]
+                    #         }
+                    #     }
+                    # }},
+                    # {"range":{"zhong_biao_number":{"gt":0}}}
+                    {
+                        "range": {
+                            "status": {
+                                "gte": 201,
+                                "lte": 300
+                            }
+                        }
+                    },
+                    {
+                        "bool": {
+                            "should": [
+                                {
+                                    "match_phrase": {
+                                        "nicknames": "合同能源管理"
+                                    }
+                                },{
+                                    "match_phrase": {
+                                        "history_names": "合同能源管理"
+                                    }
+                                },{
+                                    "match_phrase": {
+                                        "alias": "合同能源管理"
+                                    }
+                                }
+                            ]
+                        }
+                    }
+
+                    # {'match_phrase':
+                    #      {'business_scope':'合同能源管理'
+                    #       # {
+                    #       # "query": "医院",  # >= 大于等于
+                    #       # # # "lt": 1650038400000  # < 小于
+                    #       # }
+                    #       },
+                    #  },
+                    # {"bool":{
+                    #     "should":[
+                    #         {"term":{
+                    #             "district":"鹤山"
+                    #         }
+                    #         }
+                    #     ]
+                    # }},
+                    # {"bool":{
+                    #     "should":list_should
+                    # }}
+
+                ]
+                # 'must': [
+                #
+                #
+                #     # {"range":{"tyc_id":{"gt":1111}}}
+                #     # {'match_phrase':
+                #     #     {'contacts_is_legal_person':1
+                #     #         # {
+                #     #         # "query": "医院",  # >= 大于等于
+                #     #         # # # "lt": 1650038400000  # < 小于
+                #     #         # }
+                #     #     },
+                #     # },{'match_phrase':
+                #     #                        {'contacts_is_mobile':1
+                #     #                         # {
+                #     #                         # "query": "医院",  # >= 大于等于
+                #     #                         # # # "lt": 1650038400000  # < 小于
+                #     #                         # }
+                #     #                         },
+                #     #    }
+                # ]
+
+            },
+
+        },
+        # "sort": [
+        #         {"id": "desc"}
+        #     ]
+        "_source":"name"
+    }
     list_result.extend(search_data(es,body,10000000,False))
-    data_to_excel(list_result,["contacts_enterprise_name","contacts_person_name","contacts_phone_no"])
+    data_to_excel(list_result,["name"])

+ 157 - 1
export/exportProject.py

@@ -16,6 +16,7 @@ import traceback
 from utils.hashUtil import aesCipher
 from export.exportEnterprise import getDictEnterprise,getOneContact
 from export.exportUtils import generateBoolShouldQuery
+from queue import Queue
 
 
 data_path = "../data/"
@@ -444,8 +445,163 @@ def appendCellphones():
 
     file = "../data/"
 
+from export.exportUtils import *
+import jieba
+
+def export_industry_keywords_by_enterprise(list_enterprise):
+    """Export word-frequency statistics for projects won by the given enterprises.
+
+    For every name in ``list_enterprise`` one query is built
+    (``win_tenderer`` term plus ``RangeQuery("status",201,301)``, limit 1000)
+    and all queries are fetched from ``project2`` / ``project2_index`` through
+    ``getDocument``. The comma-separated ``doctitles``, ``project_name`` and
+    ``product`` values of each row are segmented with ``jieba`` and counted.
+
+    The result is written to ``../data/<timestamp>_行业关键词.xlsx`` with four
+    sheets: the raw rows, the word counts over all three fields, the word
+    counts over ``product`` only, and the counts of whole product phrases.
+    Each statistics sheet holds a header row plus at most 9999 entries,
+    ordered by descending count.
+
+    :param list_enterprise: iterable of enterprise names to query
+    """
+
+    def _bump(counter,key):
+        # Increment ``key`` in a plain dict used as a counter.
+        counter[key] = counter.get(key,0)+1
+
+    def _ranked_rows(counter):
+        # Turn a counter into [keyword,count] rows, most frequent first,
+        # preceded by the header row and cut to 10000 rows in total.
+        rows = [[_keyword,count] for _keyword,count in counter.items()]
+        rows.sort(key=lambda x:x[1],reverse=True)
+        rows.insert(0,["关键词","数量"])
+        return rows[:10000]
+
+    list_query = []
+    for _enterprise in list_enterprise:
+        _query = BoolQuery(must_queries=[
+            TermQuery("win_tenderer",_enterprise),
+            RangeQuery("status",201,301)
+        ])
+        list_query.append({"query":_query,"limit":1000})
+
+    list_data = getDocument(list_query,columns=["docid","doctitles","project_name","product","win_tenderer"],
+                            table_name="project2",
+                            table_index="project2_index")
+    dict_keywords = {}
+    dict_keywords_product = {}
+    dict_keywords_product_count = {}
+    for _data in list_data:
+        # ``or ""`` also covers a key that is present but holds None, which
+        # would otherwise fail on ``.split``.
+        for _field in ("doctitles","project_name"):
+            for _keyword in (_data.get(_field) or "").split(","):
+                for _word in jieba.cut(_keyword):
+                    _bump(dict_keywords,_word)
+        for _keyword in (_data.get("product") or "").split(","):
+            # Whole product phrases are counted separately from their words.
+            _bump(dict_keywords_product_count,_keyword)
+            for _word in jieba.cut(_keyword):
+                _bump(dict_keywords,_word)
+                _bump(dict_keywords_product,_word)
+
+    list_keywords = _ranked_rows(dict_keywords)
+    list_keywords_product = _ranked_rows(dict_keywords_product)
+    list_keywords_product_count = _ranked_rows(dict_keywords_product_count)
+
+    filename = "../data/%s_行业关键词.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"))
+    with pd.ExcelWriter(filename) as writer:
+        df_1 = pd.DataFrame(list_data)
+        df_1.to_excel(writer,sheet_name="项目数据")
+        df_data = pd.DataFrame(list_keywords)
+        df_data.to_excel(writer,sheet_name="标题项目名称产品词频统计")
+        df_data = pd.DataFrame(list_keywords_product)
+        df_data.to_excel(writer,sheet_name="产品词频统计")
+        df_data = pd.DataFrame(list_keywords_product_count)
+        df_data.to_excel(writer,sheet_name="产品项目词频统计")
+
+
+def turn_structure():
+    """Collapse the keyword columns of the source workbook into one row.
+
+    The string cells of the columns "产品排除词", "标题排除词" and
+    "标题+正文关键词" are each joined with "、" and written, together with a
+    header row, to ``../data/<timestamp>_行业关键词.xlsx`` on the sheet
+    "行业关键词".
+    """
+    source_path = r'G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2025-06\食堂食材行业关键词.xlsx'
+    sheet = pd.read_excel(source_path)
+
+    def _text_cells(column_name):
+        # Keep only values that are real strings; anything else is dropped.
+        return [cell for cell in sheet[column_name] if isinstance(cell,str)]
+
+    product_excludes = _text_cells("产品排除词")
+    title_excludes = _text_cells("标题排除词")
+    keywords = _text_cells("标题+正文关键词")
+
+    header_row = ["行业","全文关键词","全文排除词","标题排除词","产品排除词"]
+    value_row = ["食堂食材","、".join(keywords),"","、".join(title_excludes),"、".join(product_excludes)]
+    df1 = pd.DataFrame([header_row,value_row])
+    df1.to_excel("../data/%s_行业关键词.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),sheet_name="行业关键词")
+
 
 if __name__=="__main__":
-    exportProject_by_pagetime()
+    # exportProject_by_pagetime()
     # exportProjectWithOneDocid()
     # exportCompanyByCycleProduct()
+    turn_structure()
+#     list_enterprise = splitIntoList('''
+#     明喆集团股份有限公司
+# 招商积余物业管理有限公司
+# 广州粤华物业有限公司
+# 广州广电城市服务集团股份有限公司
+# 绿城物业服务集团有限公司
+# 龙城城市运营服务集团有限公司
+# 深业物业运营集团股份有限公司
+# 广东宏德科技物业有限公司
+# 保利物业服务股份有限公司
+# 新大正物业集团股份有限公司
+# 山东明德物业管理集团有限公司
+# 深圳市金地物业管理有限公司
+# 上海复欣物业管理发展有限公司
+# 招商局物业管理有限公司
+# 东吴服务产业集团(江苏)有限公司
+# 碧桂园生活服务集团股份有限公司
+# 天津市金玉物业管理有限公司
+# 润加物业服务(深圳)有限公司
+# 山东宏泰物业发展有限公司
+# 爱玛客服务产业(中国)有限公司
+# 中海物业管理有限公司
+# 浙江亚太酒店物业服务有限公司
+# 深圳万物商企物业服务有限公司
+# 天津天孚物业管理有限公司
+# 上海益中亘泰(集团)股份有限公司
+# 上海申勤物业管理服务有限公司
+# 广东公诚设备资产服务有限公司
+# 上海东湖物业管理有限公司
+# 天津市赛驰物业服务有限公司
+# 安徽省长城物业管理有限公司
+# 深圳市万科物业服务有限公司
+# 上海生乐物业管理有限公司
+# 金科智慧服务集团股份有限公司
+# 山东润华物业管理有限公司
+# 国药诺达物业服务有限公司
+# 深业置地(深圳)物业管理有限公司
+# 广东华信服务集团有限公司
+# 天津玉龙源物业管理服务股份有限公司
+# 广州珠江城市管理服务集团股份有限公司
+# 中航物业管理有限公司
+# 深圳市广美隆物业清洁服务有限公司
+# 浙江浙大新宇物业集团有限公司
+# 金融街物业股份有限公司
+# 天津峥嵘物业管理有限公司
+# 新疆德泰保安服务有限公司
+# 北京住总北宇物业服务有限责任公司
+# 广东泰科物业管理有限公司
+# 德州市陵城区人才发展集团有限公司
+# 乌鲁木齐阳光管道物业服务有限公司
+# 深业物业集团有限公司
+#     ''',_splitkeys="\n|\s")
+#     export_industry_keywords_by_enterprise(list_enterprise)

+ 17 - 17
jobs/exportJobs.py

@@ -37,7 +37,7 @@ def export_medicine_friday():
                     host = "smtp.exmail.qq.com"
                     username = "vip@bidizhaobiao.com"
                     password = "Biaoxun66-"
-                    receivers = ["1985262186@qq.com","1175730271@qq.com","1265797328@qq.com","1289358902@qq.com"]
+                    receivers = ["1985262186@qq.com","1265797328@qq.com","1289358902@qq.com"]
                     attachs = [filename]
 
                     sendEmail(host,username,password,receivers,attachs=attachs)
@@ -139,7 +139,7 @@ def export2():
                     host = "smtp.exmail.qq.com"
                     username = "vip@bidizhaobiao.com"
                     password = "Biaoxun66-"
-                    receivers = ["1175730271@qq.com","493894608@qq.com"]
+                    receivers = ["493894608@qq.com"]
                     # receivers = ["1175730271@qq.com"]
                     attachs = [filename]
 
@@ -280,7 +280,7 @@ def export5():
                     host = "smtp.exmail.qq.com"
                     username = "vip@bidizhaobiao.com"
                     password = "Biaoxun66-"
-                    receivers = ["1175730271@qq.com","365531448@qq.com"]
+                    receivers = ["365531448@qq.com"]
                     # receivers = ["1175730271@qq.com"]
                     attachs = [filename]
 
@@ -741,7 +741,7 @@ class Export3():
             username = "vip1@bidizhaobiao.com"
             password = "Biaoxun666+"
             # receivers = ["1175730271@qq.com","995116318@qq.com","huangxiaofang@cvte.com"]
-            receivers = ["1175730271@qq.com","1208135584@qq.com","youyuer@cvte.com","chenyuxue@cvte.com"]
+            receivers = ["1208135584@qq.com","544329183@qq.com","youyuer@cvte.com","chenyuxue@cvte.com"]
 
             # receivers = ["1175730271@qq.com"]
 
@@ -1009,7 +1009,7 @@ class Export3():
             username = "vip1@bidizhaobiao.com"
             password = "Biaoxun666+"
             # receivers = ["1175730271@qq.com","995116318@qq.com","huangxiaofang@cvte.com"]
-            receivers = ["1175730271@qq.com","1208135584@qq.com","wanghongyan@cvte.com"]
+            receivers = ["1208135584@qq.com","544329183@qq.com","wanghongyan@cvte.com"]
 
             # receivers = ["1175730271@qq.com"]
 
@@ -1335,8 +1335,8 @@ class Export3():
         host = "smtp.exmail.qq.com"
         username = "vip@bidizhaobiao.com"
         password = "Biaoxun66-"
-        receivers = ["1175730271@qq.com","747012698@qq.com","995116318@qq.com"]
-        receivers = ["1175730271@qq.com"]
+        receivers = ["747012698@qq.com","995116318@qq.com"]
+        # receivers = ["1175730271@qq.com"]
         attachs = [filename]
 
         sendEmail(host,username,password,receivers,attachs=attachs)
@@ -1667,8 +1667,8 @@ class Export3():
         host = "smtp.exmail.qq.com"
         username = "vip@bidizhaobiao.com"
         password = "Biaoxun66-"
-        receivers = ["1175730271@qq.com","747012698@qq.com","995116318@qq.com"]
-        receivers = ["1175730271@qq.com"]
+        receivers = ["747012698@qq.com","995116318@qq.com"]
+        # receivers = ["1175730271@qq.com"]
         attachs = [filename]
 
         sendEmail(host,username,password,receivers,attachs=attachs)
@@ -2013,7 +2013,7 @@ def export_15824381998():
                 host = "smtp.exmail.qq.com"
                 username = "vip@bidizhaobiao.com"
                 password = "Biaoxun66-"
-                receivers = ["1175730271@qq.com","565748324@qq.com","1396488964@qq.com"]
+                receivers = ["565748324@qq.com"]
                 # receivers = ["1175730271@qq.com"]
                 attachs = [filename]
 
@@ -2100,7 +2100,7 @@ def export_15824381998():
                 host = "smtp.exmail.qq.com"
                 username = "vip@bidizhaobiao.com"
                 password = "Biaoxun66-"
-                receivers = ["1175730271@qq.com","565748324@qq.com","1396488964@qq.com","1141385052@qq.com","1713739820@qq.com"]
+                receivers = ["565748324@qq.com","1141385052@qq.com","1713739820@qq.com"]
                 # receivers = ["1175730271@qq.com"]
                 attachs = [filename]
 
@@ -2535,7 +2535,7 @@ def export_13510123669():
                 host = "smtp.exmail.qq.com"
                 username = "vip@bidizhaobiao.com"
                 password = "Biaoxun66-"
-                # receivers = ["724949655@qq.com","1396488964@qq.com"]
+                # receivers = ["724949655@qq.com"]
                 receivers = ["md47@zuowei.com"]
 
                 # receivers = ["1175730271@qq.com"]
@@ -2557,8 +2557,8 @@ def job_export():
     _scheduler.add_job(export2,"cron",hour=9)
     _scheduler.add_job(export5,"cron",hour=16)
     # _scheduler.add_job(e3.export4,"cron",hour=7)
-    _scheduler.add_job(e3.trytimes,"cron",hour=21)
-    _scheduler.add_job(e3.export3_1,"cron",hour=21)
+    # _scheduler.add_job(e3.trytimes,"cron",hour=21)
+    # _scheduler.add_job(e3.export3_1,"cron",hour=21)
     _scheduler.add_job(export_15824381998,"cron",hour=21)
     _scheduler.add_job(export_13510123669,"cron",hour=10)
     _scheduler.start()
@@ -2568,14 +2568,14 @@ if __name__=="__main__":
     # export_medicine_friday()
     # export2()
     # export_document_except()
-    # e3 = Export3()
+    e3 = Export3()
     # e3.export4_by_project()
     # e3.export4()
-    # e3.trytimes()
+    e3.trytimes()
     # e3.export3_1()
     # export5()
     # export_15824381998()
-    export_13510123669()
+    # export_13510123669()
 
 
 

+ 91 - 0
mining/ChatAgent.py

@@ -0,0 +1,91 @@
+#coding:utf8
+
+import requests
+import time
+import traceback
+import json
+
+agent_id = "7458183497300492338"
+url = "https://api.coze.cn/v3/chat"
+Authorization = "Bearer pat_XXlExeHd1loVyHY1kV7Z38GjBERfmvOAvhfxCtXSYLOzNtqYXBqd9Sh3BYwJJzjw"
+user_id = "2103446848"
+
+
+headers = {"Authorization":Authorization,
+           "Content-Type":"application/json"}
+
+
+def chat_agent(msg,retry_time=3):
+    """Post ``msg`` to the chat endpoint and return the decoded reply text.
+
+    Up to ``retry_time`` attempts are made. When an attempt raises, the
+    traceback is printed and the next attempt starts after one second.
+
+    :param msg: user text sent as the single entry of ``additional_messages``
+    :param retry_time: maximum number of attempts
+    :return: the text produced by ``decode_msg``; ``None`` if every attempt
+             raised
+    """
+    for _attempt in range(retry_time):
+        try:
+            user_message = {
+                "role":"user",
+                "content":msg,
+                "content_type":"text"
+            }
+            request_body = {
+                "bot_id":agent_id,
+                "user_id":user_id,
+                "stream":True,
+                "auto_save_history":True,
+                "additional_messages":[user_message]
+            }
+            resp = requests.post(url,headers=headers,json=request_body,timeout=60)
+            return decode_msg(resp.content.decode("utf-8"))
+        except Exception:
+            traceback.print_exc()
+            time.sleep(1)
+    return None
+
+
+msg_list_url = "https://api.coze.cn/v3/chat/message/list"
+
+
+def get_msg_list(conversation_id,chat_id,retry_time=3):
+    """Request the message list of one chat and return its decoded text.
+
+    Up to ``retry_time`` attempts are made. When an attempt raises, the
+    traceback is printed and the next attempt starts after one second.
+
+    :param conversation_id: value sent as ``conversation_id``
+    :param chat_id: value sent as ``chat_id``
+    :param retry_time: maximum number of attempts
+    :return: the text produced by ``decode_msg``; ``None`` if every attempt
+             raised
+    """
+    # NOTE(review): decode_msg only collects "conversation.message.delta"
+    # event chunks - confirm this endpoint answers in that format.
+    for _ in range(retry_time):
+        try:
+
+            data_raw = {"conversation_id":conversation_id,
+                        "chat_id":chat_id}
+
+            # Same 60 second limit as chat_agent, so a stalled connection
+            # cannot block the retry loop forever.
+            resp = requests.post(msg_list_url,headers=headers,json=data_raw,timeout=60)
+
+            # Decode the response that was just received. The earlier code
+            # read `_result`, which is not defined inside this function, so
+            # every attempt failed and the function always returned None.
+            _data = resp.content.decode("utf-8")
+
+            return decode_msg(_data)
+        except Exception as e:
+            traceback.print_exc()
+            time.sleep(1)
+
+
+def decode_msg(content):
+    """Extract the reply text from an event-stream response body.
+
+    ``content`` is split into chunks on blank lines. A chunk is used only when
+    it has exactly two lines: an ``event:`` line naming
+    ``conversation.message.delta`` followed by a line whose text after the
+    first five characters (the ``data:`` prefix) is a JSON object. The
+    ``content`` strings of those objects are concatenated in order.
+
+    :param content: decoded response body
+    :return: the concatenated delta text, ``""`` when no chunk matched
+    """
+    pieces = []
+    for chunk in content.split("\n\n"):
+        lines = chunk.split("\n")
+        if len(lines)!=2:
+            continue
+        event_line,data_line = lines
+        if not event_line.startswith("event:"):
+            continue
+        # Accept both "event:name" and "event: name".
+        if event_line[len("event:"):].strip()!="conversation.message.delta":
+            continue
+        try:
+            # Drop the 5-character "data:" prefix before parsing.
+            payload = json.loads(data_line[5:])
+        except ValueError:
+            # A chunk with malformed JSON is skipped; the rest is still used.
+            continue
+        delta = payload.get("content") if isinstance(payload,dict) else None
+        # Only text can be appended; a missing or null content is ignored.
+        if isinstance(delta,str):
+            pieces.append(delta)
+    return "".join(pieces)
+
+
+if __name__ == '__main__':
+    msg = '''
+    上海市青浦区华新镇社区事务受理服务中心的完整名称和统一信用代码是多少,还有对齐原因,以json格式给出准确的结果
+    '''
+    _result = chat_agent(msg)
+    print(_result)
+    # _result = get_msg_list('7458566549444739098','7458566549444722714')
+    # print(_result.content.decode("utf-8"))

文件差異過大導致無法顯示
+ 112 - 0
mining/DoubaoUtils.py


+ 85 - 0
mining/chatUtil.py

@@ -0,0 +1,85 @@
+#coding:utf8
+
+from bs4 import BeautifulSoup
+import re
+
+def html2text(_html):
+    """Flatten HTML into plain text, rendering tables as ``|``-separated rows.
+
+    A node that still contains ``table``/``tbody`` descendants is processed
+    one direct child tag at a time and the child texts are joined with
+    newlines. A ``table``/``tbody`` node without nested tables becomes one
+    line per group of ``th`` cells and one per group of ``td`` cells in each
+    row, followed by a blank line. Any other node becomes its text with all
+    whitespace removed.
+
+    Nodes that are converted are removed from the tree with ``decompose()``,
+    so a soup object passed in by the caller is modified.
+
+    :param _html: an HTML string (parsed with lxml) or an already parsed node
+    :return: the extracted text
+    """
+    if isinstance(_html,str):
+        _soup = BeautifulSoup(_html,"lxml")
+    else:
+        _soup = _html
+
+    if _soup.find_all("table") or _soup.find_all("tbody"):
+        # Only direct child tags are visited; each one is converted on its own.
+        list_child_text = [html2text(child) for child in _soup.find_all(recursive=False)]
+        return "\n".join(list_child_text)
+
+    if _soup.name=="table" or _soup.name=="tbody":
+        list_tr_text = []
+        for tr in _soup.find_all("tr"):
+            # Header cells and data cells of one row become separate lines.
+            for cell_tag in ("th","td"):
+                tds = tr.find_all(cell_tag)
+                if len(tds)>0:
+                    list_tr_text.append("|".join(re.sub(r'\s','',td.get_text()) for td in tds))
+        if list_tr_text:
+            _table_text = "%s\n\n"%"\n".join(list_tr_text)
+        else:
+            # No cells found: fall back to the raw text. The earlier check
+            # compared the already formatted string (always ending in
+            # "\n\n") with "", so this fallback could never run.
+            _table_text = _soup.get_text()
+        _soup.decompose()
+        return _table_text
+
+    _text = re.sub(r'\s','',_soup.get_text().strip())
+    _soup.decompose()
+    return _text
+
+def table2list(_html):
+    """Convert a table into a list of rows of whitespace-free cell texts.
+
+    For every ``tr`` the ``th`` cells (if any) form one row and the ``td``
+    cells (if any) form another, in that order.
+
+    :param _html: an HTML string (parsed with lxml; its first ``table`` or
+                  ``tbody`` is used) or an already parsed node
+    :return: list of rows, each a list of cell strings; ``None`` when the
+             input is not, or does not contain, a ``table``/``tbody``
+    """
+    if isinstance(_html,str):
+        # The parsed document object itself is never named "table"/"tbody",
+        # so a string input used to fail the tag check below and always
+        # returned None. Use the first table (or tbody) it contains instead.
+        _soup = BeautifulSoup(_html,'lxml').find(["table","tbody"])
+        if _soup is None:
+            return None
+    else:
+        _soup = _html
+    if _soup.name not in ("table","tbody"):
+        return None
+    list_tr_text = []
+    for tr in _soup.find_all("tr"):
+        for cell_tag in ("th","td"):
+            list_td_text = [re.sub(r'\s','',td.get_text()) for td in tr.find_all(cell_tag)]
+            if list_td_text:
+                list_tr_text.append(list_td_text)
+    return list_tr_text
+
+def tableList2text(table_list):
+    """Render rows of cell strings as ``|``-separated lines.
+
+    Whitespace is removed from every cell, the cells of a row are joined with
+    ``|`` and the rows with newlines. Empty rows are skipped. The result
+    always ends with a blank line (``"\\n\\n"``), even for an empty table.
+
+    :param table_list: iterable of rows, each a sequence of strings
+    :return: the table as text
+    """
+    # Raw string: '\s' in a normal literal is an invalid escape sequence.
+    list_tr_text = [
+        "|".join(re.sub(r'\s','',td) for td in tr)
+        for tr in table_list
+        if len(tr)>0
+    ]
+    return "%s\n\n"%"\n".join(list_tr_text)

+ 24 - 13
utils/ERNIE_utils.py

@@ -8,6 +8,7 @@ def get_access_token():
     """
 
     url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=gnwVXv96An9qMYqq9eWbeNqk&client_secret=mDsRQbCPsV4N7x28LbwkhTAaLmrrDnXk"
+    url = "https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=Ok8QMe4qIQOAex0F9Gf1uns0&client_secret=6DjGGDdvhnBaEOMdSXAg02KxZnQhWpbd"
 
     payload = json.dumps("")
     headers = {
@@ -19,17 +20,18 @@ def get_access_token():
     return response.json().get("access_token")
 
 def main():
-    # _token = get_access_token()
-    _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    _token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
     url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
 
+    # url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/xuanyuan_70b_chat?access_token=" + _token
+
     payload = json.dumps({
         "messages": [
             {
                 "role": "user",
                 "content": '''
-                假设分类是建筑建材-建筑涂料的相关产品词“面漆”
-                请拓展其相关行业产品词,列举30个
+               今天是几号
                 '''
             }
         ]
@@ -38,30 +40,39 @@ def main():
         'Content-Type': 'application/json'
     }
 
+
+
+
     response = requests.request("POST", url, headers=headers, data=payload)
 
     print(response.text)
 
-def chat(message):
-    _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
-    url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions?access_token=" + _token
-
+def chat(msg,token=None,api_url=None):
+    if token is None:
+        token = get_access_token()
+    # _token = "24.93c9d66ffc94ffaef6c6c9d35770a5f5.2592000.1701242081.282335-37357318"
+    if api_url is None:
+        api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions"
+        # api_url = "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/ernie-3.5-128k"
+    url =  api_url+"?access_token="+ token
     payload = json.dumps({
         "messages": [
             {
                 "role": "user",
                 "content": '''
-                %s
-                '''%message
+               %s
+                '''%msg
             }
-        ]
+        ],
+        "stream":False
     })
     headers = {
         'Content-Type': 'application/json'
     }
-
     response = requests.request("POST", url, headers=headers, data=payload)
+
     return response
 
 if __name__ == '__main__':
-    main()
+    # main()
+    print(chat("今天是几号").text)

部分文件因文件數量過多而無法顯示