Browse Source

初始提交

luojiehua cách đây 4 năm
commit
a42b1ec801

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+/data/

+ 6 - 0
.idea/.gitignore

@@ -0,0 +1,6 @@
+# Default ignored files
+/workspace.xml
+
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

+ 15 - 0
.idea/codeStyles/Project.xml

@@ -0,0 +1,15 @@
+<component name="ProjectCodeStyleConfiguration">
+  <code_scheme name="Project" version="173">
+    <codeStyleSettings language="JAVA">
+      <indentOptions>
+        <option name="TAB_SIZE" value="2" />
+      </indentOptions>
+    </codeStyleSettings>
+    <codeStyleSettings language="Python">
+      <indentOptions>
+        <option name="TAB_SIZE" value="2" />
+        <option name="SMART_TABS" value="true" />
+      </indentOptions>
+    </codeStyleSettings>
+  </code_scheme>
+</component>

+ 5 - 0
.idea/codeStyles/codeStyleConfig.xml

@@ -0,0 +1,5 @@
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
+  </state>
+</component>

+ 36 - 0
.idea/dataSources.xml

@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="mysql_projuct_阿里云" uuid="2adea2cb-6cb9-4965-868e-db20993c47f2">
+      <driver-ref>mysql.8</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
+      <jdbc-url>jdbc:mysql://rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com:3306/bxkc</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="bxkc@47.98.60.3" uuid="55d18506-80f5-4270-88d3-bac5db11eb93">
+      <driver-ref>mongo</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.dbschema.MongoJdbcDriver</jdbc-driver>
+      <jdbc-url>mongodb://47.98.60.3:17017/bxkc</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="oracle 生产库" uuid="d47b5be6-14c1-4fd3-b108-75141d1efa10">
+      <driver-ref>oracle</driver-ref>
+      <synchronize>true</synchronize>
+      <auto-commit>false</auto-commit>
+      <jdbc-driver>oracle.jdbc.OracleDriver</jdbc-driver>
+      <jdbc-url>jdbc:oracle:thin:@121.46.18.113:10522:yanphone</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="@192.168.2.101" uuid="8ea2c81f-5933-4c9e-b7d4-c10c033d46a2">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://192.168.2.101:5432/</jdbc-url>
+    </data-source>
+    <data-source source="LOCAL" name="mysql_测试" uuid="19d8fc36-28e0-4de8-bed4-c356c7fd53cb">
+      <driver-ref>mysql.8</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>com.mysql.cj.jdbc.Driver</jdbc-driver>
+      <jdbc-url>jdbc:mysql://192.168.2.170:3306</jdbc-url>
+    </data-source>
+  </component>
+</project>

+ 15 - 0
.idea/encodings.xml

@@ -0,0 +1,15 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="Encoding">
+    <file url="file://$PROJECT_DIR$/data/%s-%s_tenderee_doctitle.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/2017-01-01-2022-12-23_tenderee_doctitle.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/enterprise_2017.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/enterprise_2017_1.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/exportArticle1.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/exportArticle1_title.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/exportFind_tenderee.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/exportFind_tenderee1.csv" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/data/服务型客户.txt" charset="GBK" />
+    <file url="PROJECT" charset="GBK" />
+  </component>
+</project>

+ 6 - 0
.idea/misc.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>

+ 8 - 0
.idea/modules.xml

@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/DataMining.iml" filepath="$PROJECT_DIR$/DataMining.iml" />
+    </modules>
+  </component>
+</project>

+ 6 - 0
.idea/sqldialects.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="SqlDialectMappings">
+    <file url="file://G:/要素提取标注备份/iepy_public_auth_group.sql" dialect="PostgreSQL" />
+  </component>
+</project>

+ 6 - 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

+ 9 - 0
DataMining.iml

@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

+ 265 - 0
comment/Enterprise.java

@@ -0,0 +1,265 @@
+package site.dunhanson.aliyun.tablestore.entity.bidi.enterprise;
+import lombok.Getter;
+import lombok.Setter;
+import lombok.ToString;
+import java.util.List;
+/**
+ * Enterprise-related information (entity mapped to the ots.enterprise table).
+ * Pure data holder: all accessors are generated by Lombok.
+ *
+ * @author jiangzf
+ * @date 2020/08/09
+ */
+@Getter
+@Setter
+@ToString
+public class Enterprise {
+    /* Enterprise name */
+    private String name;
+    /* Bidi ID (key used to correlate this enterprise across our databases) */
+    private Long bidiId;
+    /* Region */
+    private String area;
+    /* Province */
+    private String province;
+    /* City */
+    private String city;
+    /* District / county */
+    private String district;
+    /* Total tender-related count = tender + agency + winning-bid + bid counts */
+    private Integer bidNumber;
+    /* Number of tender (zhaobiao) announcements */
+    private Integer zhaoBiaoNumber;
+    /* Number of announcements where this enterprise acted as the agency */
+    private Integer daiLiNumber;
+    /* Number of winning-bid (zhongbiao) announcements */
+    private Integer zhongBiaoNumber;
+    /* Number of bid (toubiao) announcements */
+    private Integer touBiaoNumber;
+    /* Upgrade flag for legacy data (null or 0: pending, -1: upgrading, 1: upgraded) */
+    private Integer upgradeStatus;
+    /* Tianyancha ID */
+    private Long tycId;
+    /* Organization credit code */
+    private String creditCode;
+    /* Legal representative */
+    private String legalPerson;
+    /* Enterprise address */
+    private String address;
+    /* Phone */
+    private String phone;
+    /* Email */
+    private String email;
+    /* Registration date */
+    private String foundDate;
+    /* Registered capital */
+    private String regCapital;
+    /* Company type: 1-company, 2-Hong Kong company, 3-social organization, 4-law firm, 5-public institution, 6-foundation (NUMBER) */
+    private Integer companyType;
+    /* Opening (establishment) time */
+    private String estiblishTime;
+    /* Company profile / description */
+    private String description;
+    /* Tianyancha database update time (STRING|yyyy-mm-dd) */
+    private String tycUpdateTime;
+    /* Staff size range */
+    private String staffNumRange;
+    /* Business operation start time */
+    private Long fromTime;
+    /* Industry score, on a 0-10000 scale */
+    private Integer categoryScore;
+    /* Stock name */
+    private String bondName;
+    /* Whether this is a micro enterprise (0: no, 1: yes) */
+    private Integer isMicroEnt;
+    /* Former stock name */
+    private String usedBondName;
+    /* Registration number */
+    private String regNumber;
+    /* Enterprise score */
+    private Long percentileScore;
+    /* Registration authority */
+    private String regInstitute;
+    /* Registered address */
+    private String regLocation;
+    /* Industry */
+    private String industry;
+    /* Approval time */
+    private Long approvedTime;
+    /* Number of employees covered by social insurance */
+    private Long socialStaffNum;
+    /* Enterprise tags */
+    private String tags;
+    /* Logo (use is discouraged) */
+    private String logo;
+    /* Taxpayer identification number */
+    private String taxNumber;
+    /* Business scope */
+    private String businessScope;
+    /* English name */
+    private String property3;
+    /* Short name / alias */
+    private String alias;
+    /* Organization code */
+    private String orgNumber;
+    /* Enterprise registration status */
+    private String regStatus;
+    /* Stock type */
+    private String bondType;
+    /* Legal representative (prefer this field over legalPerson) */
+    private String legalPersonName;
+    /* Business operation end time */
+    private Long toTime;
+    /* Legal representative id */
+    private Long legalPersonId;
+    /* Data source flag */
+    private String sourceFlag;
+    /* Paid-in registered capital */
+    private String actualCapital;
+    /* Corrected (new) company name id */
+    private String tycCorrectCompanyId;
+    /* Enterprise organization type */
+    private String companyOrgType;
+    /* Time the data was crawled */
+    private Long tycUpdateTimes;
+    /* Tianyancha companyId */
+    private Long tycCompanyId;
+    /* Former names */
+    private String historyNames;
+    /* Stock code */
+    private String bondNum;
+    /* Registered capital currency: CNY, USD, EUR, etc. */
+    private String regCapitalCurrency;
+    /* Paid-in registered capital currency: CNY, USD, EUR, etc. */
+    private String actualCapitalCurrency;
+    /* Website list */
+    private String websiteList;
+    /* Contact numbers */
+    private String phoneNumber;
+    /* License revocation date */
+    private Long revokeDate;
+    /* License revocation reason */
+    private String revokeReason;
+    /* Deregistration date */
+    private Long cancelDate;
+    /* Deregistration reason */
+    private String cancelReason;
+    /* Total number of key staff */
+    private Long mainStaffsNumber;
+
+
+    /* Key staff */
+    private List<EnterpriseProfilePrimaryStaffItem> staffs;
+    /* Total number of shareholders */
+    private Long mainHoldersNumber;
+    /* Major shareholders */
+    private List<EnterpriseProfileShareHolderInfoItem> holders;
+    /* Total number of change records */
+    private Long changeInfoNumber;
+    /* Change records */
+    private List<EnterpriseProfileChangeInfoItem> changeInfo;
+    /* Total number of branches */
+    private Integer branchNumber;
+    /* Branches */
+    private List<EnterpriseProfileBranchItem> branches;
+    /* Number of patents */
+    private Integer patentsNumber;
+    /* Patents */
+    private List<EnterpriseProfilePatentItem> patents;
+    /* Number of registered copyrights */
+    private Integer copyRegWorksNumber;
+    /* Registered copyrights */
+    private List<EnterpriseProfileCopyrightOfWorksItem> copyRegWorks;
+    /* Number of business abnormality records */
+    private Integer abnormalNumber;
+    /* Business abnormality records */
+    private List<EnterpriseProfileAbnormalItem> abnormals;
+    /* Number of administrative penalties */
+    private Integer punishmentInfoNumber;
+    /* Administrative penalties */
+    private List<EnterpriseProfilePunishmentInfoItem> punishmentInfo;
+    /* Number of administrative penalties -- Credit China (new version) */
+    private Integer creditChinaV2Number;
+    /* Administrative penalties -- Credit China (new version) */
+    private List<EnterpriseProfileCreditChinaItem> creditChinaV2;
+    /* Number of serious violations */
+    private Integer illegalinfoNumber;
+    /* Serious violations */
+    private List<EnterpriseProfileIllegalInfoItem> illegalinfo;
+    /* Number of equity pledges */
+    private Integer equityInfoNumber;
+    /* Equity pledges */
+    private List<EnterpriseProfileEquityInfoItem> equityInfo;
+    /* Number of chattel mortgages */
+    private Integer mortgageInfoNumber;
+    /* Chattel mortgages */
+    private List<EnterpriseProfileMortgageInfoItem> mortgageInfo;
+    /* Number of tax arrears notices */
+    private Integer ownTaxNumber;
+    /* Tax arrears notices */
+    private List<EnterpriseProfileOwnTaxItem> ownTax;
+    /* Number of lawsuits */
+    private Integer lawSuitNumber;
+    /* Lawsuits */
+    private List<EnterpriseProfileLawSuitItem> lawSuit;
+    /* Number of court announcements */
+    private Integer courtAnnouncementNumber;
+    /* Court announcements */
+    private List<EnterpriseProfileCourtAnnouncementItem> courtAnnouncement;
+    /* Number of dishonest-debtor records */
+    private Integer dishonestNumber;
+    /* Dishonest-debtor records */
+    private List<EnterpriseProfileDishonestItem> dishonest;
+    /* Number of enforcement-target records */
+    private Integer zhixinginfoNumber;
+    /* Enforcement-target records */
+    private List<EnterpriseProfileZhixingInfoItem> zhixinginfo;
+    /* Number of qualifications */
+    private Integer qualificationsNumber;
+    /* Qualification update time (STRING|yyyy-mm-dd) */
+    private String qualificationsUpdateTime;
+    /* Qualification list */
+    private List<EnterpriseProfileQualificationItem> qualifications;
+    /* Number of registered engineering staff */
+    private Integer registeredStaffsNumber;
+    /* Registered engineering staff */
+    private List<EnterpriseProfileRegisteredStaffItem> registeredStaffs;
+    /* Number of engineering projects */
+    private Integer quaProjectsNumber;
+    /* Engineering projects */
+    private List<EnterpriseProfileQualificationProjectItem> quaProjects;
+    /* Number of bad-credit behaviors */
+    private Integer badCreditBehaviorsNumber;
+    /* Bad-credit behaviors */
+    private List<EnterpriseProfileBadCreditBehaviorItem> badCreditBehaviors;
+    /* Number of good-credit behaviors */
+    private Integer goodCreditBehaviorsNumber;
+    /* Good-credit behaviors */
+    private List<EnterpriseProfileGoodCreditBehaviorItem> goodCreditBehaviors;
+    /* Number of blacklist records */
+    private Integer creditBlackRecordsNumber;
+    /* Blacklist records */
+    private List<EnterpriseProfileCreditBlackRecordItem> creditBlackRecords;
+    /* Number of qualification change logs */
+    private Integer quaChangeLogsNumber;
+    /* Qualification change logs */
+    private List<EnterpriseProfileQuaChangeLogItem> quaChangeLogs;
+    /* Credit China data update time (STRING|yyyy-mm-dd) */
+    private String creditInfoUpdateTime;
+    /* Number of administrative licenses */
+    private Integer adminLicensesNumber;
+    /* Administrative licenses */
+    private List<EnterpriseProfileCreditChinaAdminLicenseItem> adminLicenses;
+    /* Number of administrative penalties (Credit China) */
+    private Integer adminPenaltiesNumber;
+    /* Administrative penalties (Credit China) */
+    private List<EnterpriseProfileCreditChinaAdminPenaltieItem> adminPenalties;
+    /* Number of trustworthiness red-list records */
+    private Integer creditRedRecordNumber;
+    /* Trustworthiness red-list records */
+    private List<EnterpriseProfileCreditChinaRedRecordItem> creditRedRecord;
+    /* Tender agency id; entries already indexed (e.g. via Baidu) use agent_detail_info.id, new entries use bidiId */
+    private Long zbdlId;
+    /* Legacy records take this from agent_contact_person; new records from agent_legal_person */
+    private String contactPerson;
+    /* Cooperation cases */
+    private List<AgentEventInfo> agentEventInfos;
+}

+ 229 - 0
comment/Organization.java

@@ -0,0 +1,229 @@
+package com.bidizhaobiao.data.bigdata.persistence.orgproject.entity;
+
+import org.neo4j.ogm.annotation.GeneratedValue;
+import org.neo4j.ogm.annotation.Id;
+import org.neo4j.ogm.annotation.NodeEntity;
+import org.neo4j.ogm.annotation.Property;
+
+/**
+ * Neo4j node entity representing an organization (enterprise) in the
+ * organization/project graph.
+ */
+@NodeEntity(label = "Organization")
+public class Organization {
+
+    @Id
+    @GeneratedValue
+    private Long nodeId;
+    @Property(name = "name")
+    private String name;
+
+    /**
+     * Aliases of this enterprise (comma-separated when there are several).
+     */
+    @Property(name = "nicknames")
+    private String nicknames;
+    @Property(name = "area")
+    private String area;
+    @Property(name = "province")
+    private String province;
+    @Property(name = "city")
+    private String city;
+    @Property(name = "district")
+    private String district;
+    // Used to work around a bug when re-running some data into other databases;
+    // the property can be removed once that is done
+    @Property(name = "status")
+    private String status;
+
+    /**
+     * Merge status of identical tender projects under this organization
+     * (1: merged, -1: merging, 0 or null: pending). Used for legacy data
+     * processing; removable once the processing is finished.
+     */
+    private Integer mergeStatus;
+
+    /**
+     * Upgrade flag for legacy data (null or 0: pending, -1: upgrading, 1: upgraded).
+     */
+    private Integer upgradeStatus;
+
+    private Integer bidNumber;           // total bid count = tender + agency + winning-bid + bid counts
+    private Integer zhaoBiaoNumber;     // number of tender (zhaobiao) announcements
+    private Integer daiLiNumber;        // number of announcements as agency
+    private Integer zhongBiaoNumber;    // number of winning-bid (zhongbiao) announcements
+    private Integer touBiaoNumber;      // number of bid (toubiao) announcements
+
+    /**
+     * Bidi id (i.e. the unique identifier of this enterprise on our side).
+     */
+    private Long bidiId;
+
+    /**
+     * Registered capital.
+     */
+    private String regCapital;
+
+    /**
+     * Registration time.
+     */
+    private String estiblishTime;
+
+
+    public Organization() {
+    }
+
+    public Organization(String name) {
+        this.name = name;
+    }
+
+    public Long getNodeId() {
+        return nodeId;
+    }
+
+    public void setNodeId(Long nodeId) {
+        this.nodeId = nodeId;
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public String getArea() {
+        return area;
+    }
+
+    public void setArea(String area) {
+        this.area = area;
+    }
+
+    public String getProvince() {
+        return province;
+    }
+
+    public void setProvince(String province) {
+        this.province = province;
+    }
+
+    public String getCity() {
+        return city;
+    }
+
+    public void setCity(String city) {
+        this.city = city;
+    }
+
+    public String getDistrict() {
+        return district;
+    }
+
+    public void setDistrict(String district) {
+        this.district = district;
+    }
+
+    public String getStatus() {
+        return status;
+    }
+
+    public void setStatus(String status) {
+        this.status = status;
+    }
+
+    public String getNicknames() {
+        return nicknames;
+    }
+
+    public void setNicknames(String nicknames) {
+        this.nicknames = nicknames;
+    }
+
+    public Integer getMergeStatus() {
+        // null is normalised to 0 ("pending merge") so callers never see null
+        return mergeStatus == null ? 0 : mergeStatus;
+    }
+
+    public void setMergeStatus(Integer mergeStatus) {
+        this.mergeStatus = mergeStatus;
+    }
+
+    public Integer getUpgradeStatus() {
+        return upgradeStatus;
+    }
+
+    public void setUpgradeStatus(Integer upgradeStatus) {
+        this.upgradeStatus = upgradeStatus;
+    }
+
+    public Integer getBidNumber() {
+        return bidNumber;
+    }
+
+    public void setBidNumber(Integer bidNumber) {
+        this.bidNumber = bidNumber;
+    }
+
+    public Integer getZhaoBiaoNumber() {
+        return zhaoBiaoNumber;
+    }
+
+    public void setZhaoBiaoNumber(Integer zhaoBiaoNumber) {
+        this.zhaoBiaoNumber = zhaoBiaoNumber;
+    }
+
+    public Integer getDaiLiNumber() {
+        return daiLiNumber;
+    }
+
+    public void setDaiLiNumber(Integer daiLiNumber) {
+        this.daiLiNumber = daiLiNumber;
+    }
+
+    public Integer getZhongBiaoNumber() {
+        return zhongBiaoNumber;
+    }
+
+    public void setZhongBiaoNumber(Integer zhongBiaoNumber) {
+        this.zhongBiaoNumber = zhongBiaoNumber;
+    }
+
+    public Integer getTouBiaoNumber() {
+        return touBiaoNumber;
+    }
+
+    public void setTouBiaoNumber(Integer touBiaoNumber) {
+        this.touBiaoNumber = touBiaoNumber;
+    }
+
+    public Long getBidiId() {
+        return bidiId;
+    }
+
+    public void setBidiId(Long bidiId) {
+        this.bidiId = bidiId;
+    }
+
+    public String getRegCapital() {
+        return regCapital;
+    }
+
+    public void setRegCapital(String regCapital) {
+        this.regCapital = regCapital;
+    }
+
+    public String getEstiblishTime() {
+        return estiblishTime;
+    }
+
+    public void setEstiblishTime(String estiblishTime) {
+        this.estiblishTime = estiblishTime;
+    }
+
+    /**
+     * Computes the total bid number as the sum of the four per-type counters,
+     * treating null counters as 0. Side effect: null counters are also
+     * normalised to 0 on this instance.
+     */
+    public void calculateBidNumber() {
+        zhaoBiaoNumber = zhaoBiaoNumber == null ? 0 : zhaoBiaoNumber;
+        daiLiNumber = daiLiNumber == null ? 0 : daiLiNumber;
+        zhongBiaoNumber = zhongBiaoNumber == null ? 0 : zhongBiaoNumber;
+        touBiaoNumber = touBiaoNumber == null ? 0 : touBiaoNumber;
+
+        bidNumber = zhaoBiaoNumber + daiLiNumber + zhongBiaoNumber + touBiaoNumber;
+    }
+
+}

+ 443 - 0
comment/Project.java

@@ -0,0 +1,443 @@
package com.bidizhaobiao.data.bigdata.persistence.orgproject.entity;

import org.apache.commons.collections4.CollectionUtils;
import org.neo4j.ogm.annotation.GeneratedValue;
import org.neo4j.ogm.annotation.Id;
import org.neo4j.ogm.annotation.NodeEntity;
import org.neo4j.ogm.annotation.Property;

import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;

/**
 * Neo4j node entity representing a tender project. A node carries properties
 * shared by tender (zhaobiao) and winning-bid (zhongbiao) announcements, plus
 * properties specific to each announcement type, and offers reflective helpers
 * to merge one project's properties into another.
 */
@NodeEntity(label = "Project")
public class Project {
    // -------------------------------------- properties shared by tender and winning-bid announcements
    @Id
    @GeneratedValue
    private Long nodeId;
    // Project name, project code, industry classification, area
    @Property(name = "project_name")
    private String projectName;
    @Property(name = "project_code")
    private String projectCode;
    @Property(name = "industry")
    private String industry;
    @Property(name = "info_type")
    private String infoType;
    @Property(name = "project_addr")
    private String projectAddr;
    // Region the project belongs to
    @Property(name = "area")
    private String area;
    // Province the project belongs to
    @Property(name = "province")
    private String province;
    // City the project belongs to
    @Property(name = "city")
    private String city;
    // District/county the project belongs to
    @Property(name = "district")
    private String district;
    @Property(name = "bidding_budget")
    // Tender budget (or tender ceiling price)
    private String biddingBudget;
    // Sub-project code, sub-project name, sub-project name aliases
    @Property(name = "sub_project_code")
    private String subProjectCode;
    @Property(name = "sub_project_name")
    private String subProjectName;
    @Property(name = "sub_project_name_alias")
    private String subProjectNameAlias;

    // Upgrade flag for legacy data (null or 0: pending, -1: upgrading, 1: upgraded)
    private Integer upgradeStatus;



    // Tender-announcement-specific properties: title / uuid / announcement id /
    // publish time (already normalised to yyyy-MM-dd)
    @Property(name = "zhao_biao_name")
    private String zhaoBiaoName;
    @Property(name = "zhao_biao_uuid")
    private String zhaoBiaoUuid;
    @Property(name = "zhao_biao_id")
    private String zhaoBiaoId;
    @Property(name = "zhao_biao_page_time")
    private String zhaoBiaoPageTime;


    // Winning-bid-announcement-specific properties: title / uuid / announcement id /
    // publish time (already normalised to yyyy-MM-dd) / winning price / annulled flag
    @Property(name = "zhong_biao_name")
    private String zhongBiaoName;
    @Property(name = "zhong_biao_uuid")
    private String zhongBiaoUuid;
    @Property(name = "zhong_biao_id")
    private String zhongBiaoId;
    @Property(name = "zhong_biao_page_time")
    private String zhongBiaoPageTime;
    @Property(name = "win_bid_price")
    private String winBidPrice;
    @Property(name = "is_fei_biao")
    private String isFeiBiao;

    /**
     * Creates an empty project; {@code isFeiBiao} defaults to {@code "false"}
     * (i.e. not an annulled-bid announcement).
     */
    public Project() {
        this.isFeiBiao = "false";
    }


    public Long getNodeId() {
        return nodeId;
    }

    public void setNodeId(Long nodeId) {
        this.nodeId = nodeId;
    }

    public String getProjectName() {
        return projectName;
    }

    public void setProjectName(String projectName) {
        this.projectName = projectName;
    }

    public String getIndustry() {
        return industry;
    }

    public void setIndustry(String industry) {
        this.industry = industry;
    }

    public String getProjectAddr() {
        return projectAddr;
    }

    public void setProjectAddr(String projectAddr) {
        this.projectAddr = projectAddr;
    }

    public String getZhaoBiaoName() {
        return zhaoBiaoName;
    }

    public void setZhaoBiaoName(String zhaoBiaoName) {
        this.zhaoBiaoName = zhaoBiaoName;
    }

    public String getZhaoBiaoPageTime() {
        return zhaoBiaoPageTime;
    }

    public void setZhaoBiaoPageTime(String zhaoBiaoPageTime) {
        this.zhaoBiaoPageTime = zhaoBiaoPageTime;
    }

    public String getBiddingBudget() {
        return biddingBudget;
    }

    public void setBiddingBudget(String biddingBudget) {
        this.biddingBudget = biddingBudget;
    }

    public String getWinBidPrice() {
        return winBidPrice;
    }

    public void setWinBidPrice(String winBidPrice) {
        this.winBidPrice = winBidPrice;
    }

    public String getZhongBiaoName() {
        return zhongBiaoName;
    }

    public void setZhongBiaoName(String zhongBiaoName) {
        this.zhongBiaoName = zhongBiaoName;
    }

    public String getZhongBiaoPageTime() {
        return zhongBiaoPageTime;
    }

    public void setZhongBiaoPageTime(String zhongBiaoPageTime) {
        this.zhongBiaoPageTime = zhongBiaoPageTime;
    }

    public String getProjectCode() {
        return projectCode;
    }

    public void setProjectCode(String projectCode) {
        this.projectCode = projectCode;
    }

    public String getZhaoBiaoUuid() {
        return zhaoBiaoUuid;
    }

    public void setZhaoBiaoUuid(String zhaoBiaoUuid) {
        this.zhaoBiaoUuid = zhaoBiaoUuid;
    }

    public String getZhongBiaoUuid() {
        return zhongBiaoUuid;
    }

    public void setZhongBiaoUuid(String zhongBiaoUuid) {
        this.zhongBiaoUuid = zhongBiaoUuid;
    }

    public String getIsFeiBiao() {
        return isFeiBiao;
    }

    public void setIsFeiBiao(String isFeiBiao) {
        this.isFeiBiao = isFeiBiao;
    }

    public String getSubProjectName() {
        return subProjectName;
    }

    public void setSubProjectName(String subProjectName) {
        this.subProjectName = subProjectName;
    }

    public String getSubProjectNameAlias() {
        return subProjectNameAlias;
    }

    public void setSubProjectNameAlias(String subProjectNameAlias) {
        this.subProjectNameAlias = subProjectNameAlias;
    }

    public String getInfoType() {
        return infoType;
    }

    public void setInfoType(String infoType) {
        this.infoType = infoType;
    }

    public String getSubProjectCode() {
        return subProjectCode;
    }

    public void setSubProjectCode(String subProjectCode) {
        this.subProjectCode = subProjectCode;
    }

    public String getArea() {
        return area;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getDistrict() {
        return district;
    }

    public void setDistrict(String district) {
        this.district = district;
    }

    public String getZhaoBiaoId() {
        return zhaoBiaoId;
    }

    public void setZhaoBiaoId(String zhaoBiaoId) {
        this.zhaoBiaoId = zhaoBiaoId;
    }

    public String getZhongBiaoId() {
        return zhongBiaoId;
    }

    public void setZhongBiaoId(String zhongBiaoId) {
        this.zhongBiaoId = zhongBiaoId;
    }

    public Integer getUpgradeStatus() {
        return upgradeStatus;
    }

    public void setUpgradeStatus(Integer upgradeStatus) {
        this.upgradeStatus = upgradeStatus;
    }

    /**
     * Returns the document uuid of this project: the tender uuid when present,
     * otherwise the winning-bid uuid.
     *
     * @return tender uuid or, if null, winning-bid uuid (may itself be null)
     */
    public String getUuid() {
        return zhaoBiaoUuid != null ? zhaoBiaoUuid : zhongBiaoUuid;
    }

    /**
     * Merges in the tender-announcement-specific properties of {@code project},
     * after reflectively copying its non-null shared properties.
     * Reflection failures are logged (best-effort); the explicit copies below
     * still run.
     *
     * @param project {@link Project} source project; no-op when null
     */
    public void addZhaoBiaoElement(Project project) {
        if (project != null) {
            try {
                addAllByExclude(project, null);
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.setZhaoBiaoName(project.getZhaoBiaoName());
            this.setZhaoBiaoUuid(project.getZhaoBiaoUuid());
            this.setZhaoBiaoId(project.getZhaoBiaoId());
            this.setZhaoBiaoPageTime(project.getZhaoBiaoPageTime());
        }
    }



    /**
     * Merges in the winning-bid-announcement-specific properties of
     * {@code project}, after reflectively copying its non-null shared
     * properties. Reflection failures are logged (best-effort); the explicit
     * copies below still run.
     *
     * @param project {@link Project} source project; no-op when null
     */
    public void addZhongBiaoElement(Project project) {
        if (project != null) {
            try {
                addAllByExclude(project, null);
            } catch (Exception e) {
                e.printStackTrace();
            }
            this.setZhongBiaoName(project.getZhongBiaoName());
            this.setZhongBiaoUuid(project.getZhongBiaoUuid());
            this.setZhongBiaoId(project.getZhongBiaoId());
            this.setZhongBiaoPageTime(project.getZhongBiaoPageTime());
            this.setWinBidPrice(project.getWinBidPrice());
            this.setIsFeiBiao(project.getIsFeiBiao());
        }
    }

    /**
     * Copies the properties shared by tender and winning-bid announcements
     * (non-reflective counterpart of {@link #addAllByExclude}; currently not
     * called within this class).
     *
     * @param project {@link Project} source project; no-op when null
     */
    private void addCommonElement(Project project) {
        if (project != null) {
            this.setProjectName(project.getProjectName());
            this.setProjectCode(project.getProjectCode());
            this.setIndustry(project.getIndustry());
            this.setInfoType(project.getInfoType());
            this.setProjectAddr(project.getProjectAddr());
            this.setArea(project.getArea());
            this.setProvince(project.getProvince());
            this.setCity(project.getCity());
            this.setDistrict(project.getDistrict());
            this.setBiddingBudget(project.getBiddingBudget());
            this.setSubProjectCode(project.getSubProjectCode());
            this.setSubProjectName(project.getSubProjectName());
            this.setSubProjectNameAlias(project.getSubProjectNameAlias());
            this.setUpgradeStatus(project.getUpgradeStatus());
        }
    }

    /**
     * Field names excluded by default from {@link #addAllByExclude}: the node
     * id plus every announcement-type-specific property.
     */
    private List<String> defaultExcludeFields() {
        List<String> excludeFields = new LinkedList<>();
        excludeFields.add("nodeId");

        // tender-announcement properties
        excludeFields.add("zhaoBiaoName");
        excludeFields.add("zhaoBiaoUuid");
        excludeFields.add("zhaoBiaoId");
        excludeFields.add("zhaoBiaoPageTime");

        // winning-bid-announcement properties
        excludeFields.add("zhongBiaoName");
        excludeFields.add("zhongBiaoUuid");
        excludeFields.add("zhongBiaoId");
        excludeFields.add("zhongBiaoPageTime");
        excludeFields.add("winBidPrice");
        excludeFields.add("isFeiBiao");
        return excludeFields;
    }

    /**
     * Reflectively assigns every non-null property of {@code project} (except
     * those named in {@code excludeFields}) to the corresponding property of
     * this object, via the public setters.
     *
     * @param project       source project
     * @param excludeFields field names to skip; when null or empty,
     *                      {@link #defaultExcludeFields()} is used
     * @throws InvocationTargetException if a setter throws
     * @throws IllegalAccessException    if a field or setter is inaccessible
     * @throws NoSuchMethodException     if a matching setter does not exist
     */
    public void addAllByExclude(Project project, List<String> excludeFields) throws InvocationTargetException, IllegalAccessException, NoSuchMethodException {
        if (CollectionUtils.isEmpty(excludeFields)) {
            excludeFields = defaultExcludeFields();
        }

        Field[] projectFields = project.getClass().getDeclaredFields();
        Field[] thisFields = this.getClass().getDeclaredFields();

        Map<String, Field> thisFieldMap = new HashMap<>();
        for (Field thisField : thisFields) {
            thisFieldMap.put(thisField.getName(), thisField);
        }

        for (Field projectField : projectFields) {
            String projectFieldName = projectField.getName();
            if (excludeFields.contains(projectFieldName)) {
                continue;
            }
            Field thisField = thisFieldMap.get(projectFieldName);
            // Null-check BEFORE dereferencing: the previous version called
            // thisField.setAccessible(true) first and threw an NPE whenever the
            // source declared a field this class does not have.
            if (thisField == null) {
                continue;
            }
            thisField.setAccessible(true);
            projectField.setAccessible(true);
            if (projectField.get(project) != null) {
                Method thisMethod = this.getClass().getMethod("set" + projectFieldName.substring(0, 1).toUpperCase() + projectFieldName.substring(1), projectField.getType());
                thisMethod.invoke(this, projectField.get(project));
            }
        }
    }

    /**
     * Overwrites every property of this project with the values from
     * {@code project} (null values included), except {@code nodeId}. Only
     * fields declared by both runtime classes are copied.
     *
     * @param project source project whose values win
     * @throws InvocationTargetException if a setter throws
     * @throws IllegalAccessException    if a field or setter is inaccessible
     * @throws NoSuchMethodException     if a matching setter does not exist
     */
    public void updateAll(Project project) throws InvocationTargetException, IllegalAccessException, NoSuchMethodException {
        // Set-based lookup replaces the previous O(n^2) nested field scan while
        // keeping the same name-matching semantics.
        Set<String> thisFieldNames = new HashSet<>();
        for (Field thisField : this.getClass().getDeclaredFields()) {
            thisFieldNames.add(thisField.getName());
        }
        for (Field newField : project.getClass().getDeclaredFields()) {
            String newFieldName = newField.getName();
            if (newFieldName.equals("nodeId") || !thisFieldNames.contains(newFieldName)) {
                continue;
            }
            newField.setAccessible(true);
            Method thisMethod = this.getClass().getMethod("set" + newFieldName.substring(0, 1).toUpperCase() + newFieldName.substring(1), newField.getType());
            thisMethod.invoke(this, newField.get(project));
        }
    }
}

+ 316 - 0
comment/ZhaoBiaoExtraction.java

@@ -0,0 +1,316 @@
+package com.bidizhaobiao.data.bigdata.base.entity.mongo;
+
+import org.bson.types.ObjectId;
+import org.mongodb.morphia.annotations.Entity;
+import org.mongodb.morphia.annotations.Id;
+import org.mongodb.morphia.annotations.Property;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+
/**
 * Morphia-mapped entity for the {@code zhaobiao_extraction} MongoDB collection:
 * structured fields extracted from a tender (zhao biao) announcement document.
 */
@Entity(value = "zhaobiao_extraction", noClassnameStored = true)
public class ZhaoBiaoExtraction {

    @Id
    private ObjectId id;
    // Project name
    @Property("project_name")
    private String projectName;
    // Project code / number
    @Property("project_code")
    private String projectCode;
    // Project address
    @Property("project_addr")
    private String projectAddr;
    // Publish date of the announcement document
    @Property("page_time")
    private String pageTime;
    // Region the project belongs to
    @Property("area")
    private String area;
    // Province the project belongs to
    @Property("province")
    private String province;
    // City the project belongs to
    @Property("city")
    private String city;
    // District / county the project belongs to
    @Property("district")
    private String district;
    // Industry classification of the project (sub-category)
    @Property("industry")
    private String industry;
    // Industry classification of the project (top-level category)
    @Property("info_type")
    private String infoType;
    // Bidding budget (or bid control price)
    @Property("bidding_budget")
    private String biddingBudget;
    // Article UUID
    @Property("document_id")
    private String documentId;
    // Article title (original title)
    @Property("document_title")
    private String documentTitle;
    // Tenderee (the party inviting bids)
    @Property("tenderee")
    private String tenderee;
    // Tenderee address
    @Property("tenderee_addr")
    private String tendereeAddr;
    // Tenderee phone
    @Property("tenderee_phone")
    private String tendereePhone;
    // Tenderee contact person
    @Property("tenderee_contact")
    private String tendereeContact;
    // Bidding agency
    @Property("agency")
    private String agency;
    // Bidding agency phone
    @Property("agency_phone")
    private String agencyPhone;
    // Bidding agency contact person
    @Property("agency_contact")
    private String agencyContact;
    // Sub-project name
    @Property("sub_project_name")
    private String subProjectName;
    // Sub-project code
    @Property("sub_project_code")
    private String subProjectCode;

    private String upgradeStatus;      // Used for upgrading legacy data (null and 0: pending upgrade, -1: upgrading, 1: upgraded)

    private String docId;   // Announcement id

    public ObjectId getId() {
        return id;
    }

    public void setId(ObjectId id) {
        this.id = id;
    }

    public String getProjectName() {
        return projectName;
    }

    public void setProjectName(String projectName) {
        this.projectName = projectName;
    }

    public String getProjectCode() {
        return projectCode;
    }

    public void setProjectCode(String projectCode) {
        this.projectCode = projectCode;
    }

    public String getProjectAddr() {
        return projectAddr;
    }

    public void setProjectAddr(String projectAddr) {
        this.projectAddr = projectAddr;
    }

    public String getPageTime() {
        return pageTime;
    }

    public void setPageTime(String pageTime) {
        this.pageTime = pageTime;
    }

    public String getProvince() {
        return province;
    }

    public void setProvince(String province) {
        this.province = province;
    }

    public String getIndustry() {
        return industry;
    }

    public void setIndustry(String industry) {
        this.industry = industry;
    }

    public String getBiddingBudget() {
        return biddingBudget;
    }

    public void setBiddingBudget(String biddingBudget) {
        this.biddingBudget = biddingBudget;
    }

    public String getDocumentId() {
        return documentId;
    }

    public void setDocumentId(String documentId) {
        this.documentId = documentId;
    }

    public String getDocumentTitle() {
        return documentTitle;
    }

    public void setDocumentTitle(String documentTitle) {
        this.documentTitle = documentTitle;
    }

    public String getTenderee() {
        return tenderee;
    }

    public void setTenderee(String tenderee) {
        this.tenderee = tenderee;
    }

    public String getTendereeAddr() {
        return tendereeAddr;
    }

    public void setTendereeAddr(String tendereeAddr) {
        this.tendereeAddr = tendereeAddr;
    }

    public String getTendereePhone() {
        return tendereePhone;
    }

    public void setTendereePhone(String tendereePhone) {
        this.tendereePhone = tendereePhone;
    }

    public String getTendereeContact() {
        return tendereeContact;
    }

    public void setTendereeContact(String tendereeContact) {
        this.tendereeContact = tendereeContact;
    }

    public String getAgency() {
        return agency;
    }

    public void setAgency(String agency) {
        this.agency = agency;
    }

    public String getAgencyPhone() {
        return agencyPhone;
    }

    public void setAgencyPhone(String agencyPhone) {
        this.agencyPhone = agencyPhone;
    }

    public String getAgencyContact() {
        return agencyContact;
    }

    public void setAgencyContact(String agencyContact) {
        this.agencyContact = agencyContact;
    }

    public String getSubProjectName() {
        return subProjectName;
    }

    public void setSubProjectName(String subProjectName) {
        this.subProjectName = subProjectName;
    }

    public String getSubProjectCode() {
        return subProjectCode;
    }

    public void setSubProjectCode(String subProjectCode) {
        this.subProjectCode = subProjectCode;
    }

    public String getInfoType() {
        return infoType;
    }

    public void setInfoType(String infoType) {
        this.infoType = infoType;
    }

    public String getArea() {
        return area;
    }

    public void setArea(String area) {
        this.area = area;
    }

    public String getCity() {
        return city;
    }

    public void setCity(String city) {
        this.city = city;
    }

    public String getDistrict() {
        return district;
    }

    public void setDistrict(String district) {
        this.district = district;
    }

    public String getDocId() {
        return docId;
    }

    public void setDocId(String docId) {
        this.docId = docId;
    }

    public String getUpgradeStatus() {
        return upgradeStatus;
    }

    public void setUpgradeStatus(String upgradeStatus) {
        this.upgradeStatus = upgradeStatus;
    }

    /**
     * Copies every non-null field value from the given instance onto this one,
     * matching fields by name and invoking the conventional setter, except the
     * {@code id} field which is preserved.
     * NOTE(review): reflection failures are swallowed and only printed to stderr,
     * so a partially-merged object can be produced silently.
     *
     * @param zhaoBiaoExtraction the source object whose non-null fields are copied
     */
    public void addAll(ZhaoBiaoExtraction zhaoBiaoExtraction)  {
        try {
            Field[] newFields = zhaoBiaoExtraction.getClass().getDeclaredFields();
            Field[] thisFields = this.getClass().getDeclaredFields();
            for (Field newField : newFields) {
                String newFieldName = newField.getName();

                // Never overwrite the MongoDB primary key.
                if (newFieldName.equals("id")) {
                    continue;
                }

                for (Field thisField : thisFields) {
                    String thisFieldName = thisField.getName();
                    if (thisFieldName.equals(newFieldName)) {
                        thisField.setAccessible(true);
                        newField.setAccessible(true);
                        // Only non-null source values win (merge semantics).
                        if (newField.get(zhaoBiaoExtraction) != null) {
                            Method thisMethod = this.getClass().getMethod("set" + thisFieldName.substring(0, 1).toUpperCase() + thisFieldName.substring(1), newField.getType());
                            thisMethod.invoke(this, newField.get(zhaoBiaoExtraction));
                        }
                        break;
                    }
                }
            }
        } catch(Exception e) {
            e.printStackTrace();
        }
    }

}

+ 446 - 0
comment/ZhongBiaoExtraction.java

@@ -0,0 +1,446 @@
+package com.bidizhaobiao.data.bigdata.base.entity.mongo;
+
+import org.bson.types.ObjectId;
+import org.mongodb.morphia.annotations.Entity;
+import org.mongodb.morphia.annotations.Id;
+import org.mongodb.morphia.annotations.Property;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+
+@Entity(value = "zhongbiao_extraction", noClassnameStored = true)
+public class ZhongBiaoExtraction {
+
+    @Id
+    private ObjectId id;
+    // 项目名称
+    @Property("project_name")
+    private String projectName;
+    // 项目编号
+    @Property("project_code")
+    private String projectCode;
+    // 项目地址
+    @Property("project_addr")
+    private String projectAddr;
+    // 公告文档发布日期
+    @Property("page_time")
+    private String pageTime;
+    // 项目归属地区
+    @Property("area")
+    private String area;
+    // 项目归属省份
+    @Property("province")
+    private String province;
+    // 项目归属城市
+    @Property("city")
+    private String city;
+    // 项目归属区县
+    @Property("district")
+    private String district;
+    // 项目归属行业分类(小类)
+    @Property("industry")
+    private String industry;
+    // 项目归属行业分类(大类)
+    @Property("info_type")
+    private String infoType;
+    // 文章UUID
+    @Property("document_id")
+    private String documentId;
+    // 文章标题(源标题)
+    @Property("document_title")
+    private String documentTitle;
+    // 招标人
+    @Property("tenderee")
+    private String tenderee;
+    // 招标人地址
+    @Property("tenderee_addr")
+    private String tendereeAddr;
+    // 招标人电话
+    @Property("tenderee_phone")
+    private String tendereePhone;
+    // 招标联系人
+    @Property("tenderee_contact")
+    private String tendereeContact;
+    // 代理机构
+    @Property("agency")
+    private String agency;
+    // 代理机构电话
+    @Property("agency_phone")
+    private String agencyPhone;
+    // 代理联系人
+    @Property("agency_contact")
+    private String agencyContact;
+    // 项目子名称
+    @Property("sub_project_name")
+    private String subProjectName;
+    // 项目子编号
+    @Property("sub_project_code")
+    private String subProjectCode;
+    // 中标人
+    @Property("win_tenderer")
+    private String winTenderer;
+    // 中标价
+    @Property("win_bid_price")
+    private String winBidPrice;
+    // 中标人联系人
+    @Property("win_tenderer_manager")
+    private String winTendererManager;
+    // 中标人联系人电话
+    @Property("win_tenderer_phone")
+    private String winTendererPhone;
+    // 第二中标人
+    @Property("second_tenderer")
+    private String secondTenderer;
+    // 第二中标价
+    @Property("second_bid_price")
+    private String secondBidPrice;
+    // 第二中标人联系人
+    @Property("second_tenderer_manager")
+    private String secondTendererManager;
+    // 第二中标人联系人
+    @Property("second_tenderer_phone")
+    private String secondTendererPhone;
+    // 第三中标人
+    @Property("third_tenderer")
+    private String thirdTenderer;
+    // 第三中标价
+    @Property("third_bid_price")
+    private String thirdBidPrice;
+    // 第三中标人联系人
+    @Property("third_tenderer_manager")
+    private String thirdTendererManager;
+    // 第三中标人联系人
+    @Property("third_tenderer_Phone")
+    private String thirdTendererPhone;
+    // 是否废标
+    @Property("is_effective")
+    private boolean isEffective;
+
+    /**
+     * 公告id
+     */
+    private String docId;
+
+    private String upgradeStatus;      // 这个属性用于旧数据的升级(null和0:待升级、-1:升级中、1:已升级)
+
+    public ObjectId getId() {
+        return id;
+    }
+
+    public void setId(ObjectId id) {
+        this.id = id;
+    }
+
+    public String getProjectName() {
+        return projectName;
+    }
+
+    public void setProjectName(String projectName) {
+        this.projectName = projectName;
+    }
+
+    public String getProjectCode() {
+        return projectCode;
+    }
+
+    public void setProjectCode(String projectCode) {
+        this.projectCode = projectCode;
+    }
+
+    public String getProjectAddr() {
+        return projectAddr;
+    }
+
+    public void setProjectAddr(String projectAddr) {
+        this.projectAddr = projectAddr;
+    }
+
+    public String getPageTime() {
+        return pageTime;
+    }
+
+    public void setPageTime(String pageTime) {
+        this.pageTime = pageTime;
+    }
+
+    public String getProvince() {
+        return province;
+    }
+
+    public void setProvince(String province) {
+        this.province = province;
+    }
+
+    public String getIndustry() {
+        return industry;
+    }
+
+    public void setIndustry(String industry) {
+        this.industry = industry;
+    }
+
+    public String getDocumentId() {
+        return documentId;
+    }
+
+    public void setDocumentId(String documentId) {
+        this.documentId = documentId;
+    }
+
+    public String getDocumentTitle() {
+        return documentTitle;
+    }
+
+    public void setDocumentTitle(String documentTitle) {
+        this.documentTitle = documentTitle;
+    }
+
+    public String getTenderee() {
+        return tenderee;
+    }
+
+    public void setTenderee(String tenderee) {
+        this.tenderee = tenderee;
+    }
+
+    public String getTendereeAddr() {
+        return tendereeAddr;
+    }
+
+    public void setTendereeAddr(String tendereeAddr) {
+        this.tendereeAddr = tendereeAddr;
+    }
+
+    public String getTendereePhone() {
+        return tendereePhone;
+    }
+
+    public void setTendereePhone(String tendereePhone) {
+        this.tendereePhone = tendereePhone;
+    }
+
+    public String getTendereeContact() {
+        return tendereeContact;
+    }
+
+    public void setTendereeContact(String tendereeContact) {
+        this.tendereeContact = tendereeContact;
+    }
+
+    public String getAgency() {
+        return agency;
+    }
+
+    public void setAgency(String agency) {
+        this.agency = agency;
+    }
+
+    public String getAgencyPhone() {
+        return agencyPhone;
+    }
+
+    public void setAgencyPhone(String agencyPhone) {
+        this.agencyPhone = agencyPhone;
+    }
+
+    public String getAgencyContact() {
+        return agencyContact;
+    }
+
+    public void setAgencyContact(String agencyContact) {
+        this.agencyContact = agencyContact;
+    }
+
+    public String getSubProjectName() {
+        return subProjectName;
+    }
+
+    public void setSubProjectName(String subProjectName) {
+        this.subProjectName = subProjectName;
+    }
+
+    public String getSubProjectCode() {
+        return subProjectCode;
+    }
+
+    public void setSubProjectCode(String subProjectCode) {
+        this.subProjectCode = subProjectCode;
+    }
+
+    public String getWinTenderer() {
+        return winTenderer;
+    }
+
+    public void setWinTenderer(String winTenderer) {
+        this.winTenderer = winTenderer;
+    }
+
+    public String getWinBidPrice() {
+        return winBidPrice;
+    }
+
+    public void setWinBidPrice(String winBidPrice) {
+        this.winBidPrice = winBidPrice;
+    }
+
+    public String getWinTendererManager() {
+        return winTendererManager;
+    }
+
+    public void setWinTendererManager(String winTendererManager) {
+        this.winTendererManager = winTendererManager;
+    }
+
+    public String getSecondTenderer() {
+        return secondTenderer;
+    }
+
+    public void setSecondTenderer(String secondTenderer) {
+        this.secondTenderer = secondTenderer;
+    }
+
+    public String getSecondBidPrice() {
+        return secondBidPrice;
+    }
+
+    public void setSecondBidPrice(String secondBidPrice) {
+        this.secondBidPrice = secondBidPrice;
+    }
+
+    public String getSecondTendererManager() {
+        return secondTendererManager;
+    }
+
+    public void setSecondTendererManager(String secondTendererManager) {
+        this.secondTendererManager = secondTendererManager;
+    }
+
+    public String getThirdTenderer() {
+        return thirdTenderer;
+    }
+
+    public void setThirdTenderer(String thirdTenderer) {
+        this.thirdTenderer = thirdTenderer;
+    }
+
+    public String getThirdBidPrice() {
+        return thirdBidPrice;
+    }
+
+    public void setThirdBidPrice(String thirdBidPrice) {
+        this.thirdBidPrice = thirdBidPrice;
+    }
+
+    public String getThirdTendererManager() {
+        return thirdTendererManager;
+    }
+
+    public void setThirdTendererManager(String thirdTendererManager) {
+        this.thirdTendererManager = thirdTendererManager;
+    }
+
+    public void setIsEffective(boolean isEffective) {
+        this.isEffective = isEffective;
+    }
+
+    public String getInfoType() {
+        return infoType;
+    }
+
+    public void setInfoType(String infoType) {
+        this.infoType = infoType;
+    }
+
+    public String getArea() {
+        return area;
+    }
+
+    public void setArea(String area) {
+        this.area = area;
+    }
+
+    public String getCity() {
+        return city;
+    }
+
+    public void setCity(String city) {
+        this.city = city;
+    }
+
+    public String getDistrict() {
+        return district;
+    }
+
+    public void setDistrict(String district) {
+        this.district = district;
+    }
+
+    public String getWinTendererPhone() {
+        return winTendererPhone;
+    }
+
+    public void setWinTendererPhone(String winTendererPhone) {
+        this.winTendererPhone = winTendererPhone;
+    }
+
+    public String getSecondTendererPhone() {
+        return secondTendererPhone;
+    }
+
+    public void setSecondTendererPhone(String secondTendererPhone) {
+        this.secondTendererPhone = secondTendererPhone;
+    }
+
+    public String getThirdTendererPhone() {
+        return thirdTendererPhone;
+    }
+
+    public void setThirdTendererPhone(String thirdTendererPhone) {
+        this.thirdTendererPhone = thirdTendererPhone;
+    }
+
+    public String getDocId() {
+        return docId;
+    }
+
+    public void setDocId(String docId) {
+        this.docId = docId;
+    }
+
+    public String getUpgradeStatus() {
+        return upgradeStatus;
+    }
+
+    public void setUpgradeStatus(String upgradeStatus) {
+        this.upgradeStatus = upgradeStatus;
+    }
+
+    public void addAll(ZhongBiaoExtraction zhongBiaoExtraction)  {
+        try {
+            Field[] newFields = zhongBiaoExtraction.getClass().getDeclaredFields();
+            Field[] thisFields = this.getClass().getDeclaredFields();
+            for (Field newField : newFields) {
+                String newFieldName = newField.getName();
+
+                if (newFieldName.equals("id")) {
+                    continue;
+                }
+
+                for (Field thisField : thisFields) {
+                    String thisFieldName = thisField.getName();
+                    if (thisFieldName.equals(newFieldName)) {
+                        thisField.setAccessible(true);
+                        newField.setAccessible(true);
+                        if (newField.get(zhongBiaoExtraction) != null) {
+                            Method thisMethod = this.getClass().getMethod("set" + thisFieldName.substring(0, 1).toUpperCase() + thisFieldName.substring(1), newField.getType());
+                            thisMethod.invoke(this, newField.get(zhongBiaoExtraction));
+                        }
+                        break;
+                    }
+                }
+            }
+        } catch(Exception e) {
+            e.printStackTrace();
+        }
+    }
+}

+ 0 - 0
comment/__init__.py


+ 0 - 0
dataSource/__init__.py


+ 28 - 0
dataSource/pool.py

@@ -0,0 +1,28 @@
+
+from multiprocessing import RLock
+import queue
+
+
class ConnectorPool():
    """Thread-safe pool of reusable connector objects.

    ``method_init(**kwargs)`` is called to build each connector. The pool
    starts with ``init_num`` connectors and grows on demand up to ``max_num``.
    """

    def __init__(self, init_num, max_num, method_init, **kwargs):
        self.connector_pool = queue.Queue()
        for _ in range(init_num):
            self.connector_pool.put(method_init(**kwargs))
        self.method_init = method_init
        self.kwargs = kwargs
        self._lock = RLock()
        self.pool_size = init_num   # number of connectors ever created
        self.max_num = max_num

    def getConnector(self):
        """Return a connector, creating one if the pool is empty and under max_num.

        Blocks until a connector is available when the pool is empty and at
        capacity.
        """
        with self._lock:
            if self.connector_pool.empty() and self.pool_size < self.max_num:
                self.connector_pool.put(self.method_init(**self.kwargs))
                # BUG FIX: pool_size was never incremented, so the max_num cap
                # was never enforced and connectors grew without bound.
                self.pool_size += 1
        # BUG FIX: the blocking get() must happen OUTSIDE the lock; blocking
        # while holding _lock deadlocked against putConnector, which needed
        # the same lock to return a connector.
        return self.connector_pool.get(block=True)

    def putConnector(self, _conn):
        """Return a connector to the pool."""
        with self._lock:
            self.connector_pool.put(_conn)

+ 42 - 0
dataSource/setttings.py

@@ -0,0 +1,42 @@
# encoding: utf-8
# Connection settings for the project's data sources.
# NOTE(review): credentials are committed in plain text; they should be moved
# to environment variables or a secrets store.

# Solr base URL per collection name.
solr_collections = {"document":"http://47.97.221.63:8983/solr/",# documents
                    "company":"http://47.97.210.202:8983/solr/",# companies
                    "contact":"http://47.97.210.202:8983/solr/",# contact persons
                    "designed_project":"http://47.97.210.202:8983/solr/",
                    "exclusive_project":"http://47.97.210.202:8983/solr/",
                    "keyword_dict":"http://47.97.210.202:8983/solr/",
                    "shen_pi_xiang_mu":"http://47.97.210.202:8983/solr/",# approval projects
                    "t_company_qualification":"http://47.97.210.202:8983/solr/",
                    "t_registrant":"http://47.97.210.202:8983/solr/"}

# Production MySQL (read-only account).
mysql_host = "rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com"
mysql_port = 3306
mysql_user = "bxkc_read"
mysql_pass = "bxkc_20RE18AD"
mysql_db = "bxkc"

# Internal test MySQL used as the export target.
test_mysql_host = "192.168.2.170"
test_mysql_port = 3306
test_mysql_user = "root"
test_mysql_pass = "pwdformysql0922"
test_mysql_db = "exportDB"

# MongoDB (read-only account).
mongo_host = "47.98.60.3"
mongo_port = 17017
mongo_db = "bxkc"
mongo_user = "bxkc_read"
mongo_pass = "BidiReadOnly2017"

# Elasticsearch search endpoint.
elasticSearch_url = "http://47.97.210.202:9200/_search"

# Neo4j graph database.
# neo4j_host = "47.98.60.3"
neo4j_host = "118.31.10.60"
neo4j_port = 7687
neo4j_user = "bxkc_web"
neo4j_pass = "bxkc_web"

# Oracle source database (read-only account).
oracle_host = "121.46.18.113"
oracle_port = 10522
oracle_user = "bxkc_data_readonly"
oracle_pass = "P7WUrgcz0@#j8pjg"
oracle_db = "yanphone"

+ 94 - 0
dataSource/source.py

@@ -0,0 +1,94 @@
+#encoding:UTF8
+
+from dataSource.setttings import *
+import requests
+import json
+import pymysql
+import pymongo
+from py2neo import Graph,NodeMatcher
+import tablestore
+
+
+
def solrQuery(collection, args):
    """Run a select query against a known solr collection.

    ``args`` maps query-string keys to (already URL-encoded) values.
    Returns the parsed JSON response, or None on an unknown collection
    or a non-200 response.
    """
    if collection not in solr_collections:
        return None
    query_string = "&".join("%s=%s" % (key, value) for key, value in args.items())
    url = "%s%s/select?%s" % (solr_collections[collection], collection, query_string)
    resp = requests.get(url)
    if resp.status_code == 200:
        return json.loads(resp.content.decode())
    return None
+
def solrQuery_url(url):
    """GET a fully-formed solr URL; return the parsed JSON body on HTTP 200, else None."""
    resp = requests.get(url)
    return json.loads(resp.content.decode()) if resp.status_code == 200 else None
+
def getConnection_mysql(db=None):
    """Open a connection to the production MySQL server.

    ``db`` defaults to the schema configured in settings.
    """
    target_db = mysql_db if db is None else db
    return pymysql.Connect(host=mysql_host, port=mysql_port, db=target_db,
                           user=mysql_user, passwd=mysql_pass)
+
def getConnection_testmysql(db=None):
    """Open a connection to the internal test MySQL server.

    ``db`` defaults to the export schema configured in settings.
    """
    target_db = test_mysql_db if db is None else db
    return pymysql.Connect(host=test_mysql_host, port=test_mysql_port, db=target_db,
                           user=test_mysql_user, passwd=test_mysql_pass)
+
def getConnection_oracle():
    """Open a connection to the Oracle source database.

    cx_Oracle is imported lazily so the module loads without the Oracle client.
    """
    import cx_Oracle
    dsn = '%s:%s/%s' % (oracle_host, oracle_port, oracle_db)
    return cx_Oracle.connect(oracle_user, oracle_pass, dsn,
                             encoding="UTF-8", nencoding="UTF-8")
+
def getConnect_mongodb():
    """Connect to MongoDB and return the authenticated database handle.

    NOTE(review): ``Database.authenticate`` only exists in older pymongo
    releases -- confirm the pinned pymongo version before upgrading.
    """
    db = pymongo.MongoClient(mongo_host, mongo_port)[mongo_db]
    db.authenticate(mongo_user, mongo_pass)
    return db
+
def make_elasticSearch(query):
    """POST a query body to the elasticsearch _search endpoint.

    Returns the parsed JSON response on HTTP 200, else None.
    """
    resp = requests.post(elasticSearch_url, json=query)
    return json.loads(resp.content.decode()) if resp.status_code == 200 else None
+
def getConnect_neo4j():
    """Return a py2neo Graph bound to the neo4j server configured in settings."""
    return Graph(host=neo4j_host, auth=(neo4j_user, neo4j_pass))
+
def getConnect_ots():
    # Return a tablestore (Aliyun OTS) client for the bxkc-ots instance.
    # NOTE(review): SECURITY -- the AccessKey id/secret are hardcoded in source
    # and committed to version control; they should be rotated and loaded from
    # the environment or a secrets store instead.
    ots_client = tablestore.client.OTSClient('https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com', 'LTAI4FyUT7ZcQFZPjVtw5y9b', '2zscfFTvy3JWavtCeCOthLxF8bDNH3',
                                             'bxkc-ots', logger_name = 'table_store.log',
                                             retry_policy = tablestore.WriteRetryPolicy())
    return ots_client
+
def getConnect_gdb():
    """Connect to the Aliyun GDB gremlin endpoint, run a smoke-test traversal
    (printing the vertex ids it returns), and return the client.

    The driver module is aliased on import so the returned client object does
    not shadow the ``client`` module name, as the original code did.
    """
    from gremlin_python.driver import client as gremlin_driver
    gdb_client = gremlin_driver.Client(
        'ws://gds-bp130d7rgd9m7n61150070pub.graphdb.rds.aliyuncs.com:3734/gremlin',
        'g', username="bxkc", password="k0n1bxkc!0K^Em%j")
    callback = gdb_client.submitAsync("g.V('北京赛克德利科贸有限公司').outE('ZhongBiaoRelation').inV().inE('ZhaoBiaoRelation').outV()")
    for result in callback.result():
        for item in result:
            print(item.id)
    return gdb_client
+
+
if __name__=="__main__":
    # Ad-hoc smoke tests for the various data sources; everything except the
    # Oracle connection check is left commented out.
    # solrQuery("document",{"q":"*:*"})
    # getConnect_mongodb()
    # data = solrQuery_url('http://47.97.221.63:8983/solr/document/select?fq=(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])&q=dochtmlcon:"防盗门"')
    # data = solrQuery("document",{"q":'dochtmlcon:"防盗门"',"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
    # data = make_elasticSearch({"query":{"bool":{"must":[{"wildcard":{"nicknames.keyword":"*服装*"}}],"must_not":[],"should":[]}},"from":0,"size":10,"sort":[],"aggs":{}})
    # print(data)
    # getConnect_neo4j()
    conn = getConnection_oracle()
    # cursor = conn.cursor()
    # getConnect_gdb()

+ 24 - 0
dataSource/账号.txt

@@ -0,0 +1,24 @@
+检索系统: http://47.97.221.63:8983/solr/#/document/query
+			http://47.97.210.202:8983/solr/#/designed_project/query
+Mongodb: host:121.46.18.113	port:17017    账号:bxkc_read	 密码:BidiReadOnly2017  product:47.98.60.3:17017
+http://47.98.60.3:7474/browser/ Neo4j: bolt://47.98.60.3/:7687  账号:bxkc_web	 密码:bxkc_web
+rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com bxkc_read bxkc_20RE18AD
+elasticSearch http://47.97.210.202:9100/
+
+千里马
+A15013122808 biaoxun666@
+剑鱼
+14748340652	biaoxun666
+14748252604	biaoxun666
+13418079164	biaoxun666
+14748450674	biaoxun666
+14748342971	biaoxun666
+14745297952	biaoxun666
+14748311820	biaoxun666
+14748317104	biaoxun666
+14743601752	biaoxun666
+14743440732	14743440732
+中国导航网
+a15711848144 bidi8888@
+中国招标网
+xinchen2020 bidi8888@

+ 0 - 0
export/__init__.py


+ 2308 - 0
export/exportDocument.py

@@ -0,0 +1,2308 @@
+#encoding:GBK
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+from utils.hashUtil import aesCipher
+from export.exportEnterprice import getDictEnterprise,getOneContact
+
+
+data_path = "../data/"
+
+def getCompanyTenderer():
+    # Reads company names from ../data/服务型客户.txt, then for each company
+    # queries Neo4j (win-bid relations) and MongoDB (zhongbiao_extraction) to
+    # collect: win count, up to 3 sample project names, total win money, and
+    # top-10 competitors; results are written to "<filename>_export.xls".
+
+    def _handle(item,result_queue):
+        # Worker run by MultiThreadHandler: item is one company name; the
+        # aggregated stats dict is pushed onto result_queue.
+        company = item
+        dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
+        dict_result["company"] = company
+        graph = getConnect_neo4j()
+        # NOTE(review): company name is interpolated directly into the Cypher
+        # query — names containing quotes will break/inject the query.
+        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
+        finded = graph.run(cql)
+        # Round-trip through json to turn py2neo records into plain dicts.
+        finded_ids = json.loads(json.dumps(finded.data()))
+        dict_result["count"] = len(finded_ids)
+        mongoDB = getConnect_mongodb()
+        coll_zb = mongoDB.zhongbiao_extraction
+        if len(finded_ids)>0:
+            # Sample up to 3 project names for the report.
+            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
+            finded = graph.run(cql)
+            finded_names = json.loads(json.dumps(finded.data()))
+            list_names = [_i["p.project_name"] for _i in finded_names]
+            dict_result["project_name"] = str(list_names)
+
+            # Sum the win prices across all win-bid relations.
+            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
+            finded = graph.run(cql)
+            finded_money = json.loads(json.dumps(finded.data()))
+            whole_money = 0
+            for item in finded_money:
+                if item["r.price"] is not None:
+                    whole_money += getUnifyMoney(item["r.price"])
+            dict_result["whole_money"] = str(whole_money)
+            # Count how often other companies appear as runner-ups on the same
+            # announcements; used to rank "competitors".
+            dict_competitor = {}
+            for item in finded_ids:
+                docId = item["p.zhong_biao_id"]
+                if docId is not None:
+                    rows = coll_zb.find({"docId":docId})
+                    for row in rows:
+                        keys = ["second_tenderer","third_tenderer"]
+                        for _key in keys:
+                            if _key in row:
+                                if row[_key] not in dict_competitor:
+                                    dict_competitor[row[_key]] = 0
+                                dict_competitor[row[_key]] += 1
+            list_competitor = []
+            for _key in dict_competitor:
+                list_competitor.append([_key,dict_competitor[_key]])
+            list_competitor.sort(key=lambda x:x[1],reverse=True)
+            list_competitors = [i[0] for i in list_competitor[:10]]
+            dict_result["competitor"] = str(list_competitors)
+        result_queue.put(dict_result)
+
+
+
+
+    # filename = "成交客户匹配中标项目的需求.xlsx"
+    # df = pd.read_excel(filename)
+    # list_company = df["公司名字"]
+    # company = list_company[0]
+    list_company = []
+    filename = "../data/服务型客户.txt"
+    with open(filename,"r",encoding="GBK") as f:
+        while(True):
+            line = f.readline()
+            if not line:
+                break
+            list_company.append(line.strip())
+    task_queue = queue.Queue()
+    for company in list_company:
+        task_queue.put(company)
+    result_queue = queue.Queue()
+    handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
+    handler.run()
+    # Drain the result queue into parallel column lists for the DataFrame.
+    list_company = []
+    list_zb = []
+    list_count = []
+    list_project = []
+    list_money = []
+    list_competitor = []
+    while(True):
+
+        try:
+            _result = result_queue.get(False)
+            list_company.append(_result.get("company",""))
+            # NOTE(review): default "" here would raise TypeError on ">0" if
+            # "count" were ever missing; in practice _handle always sets it.
+            list_zb.append("是" if _result.get("count","")>0 else "否")
+            list_count.append(_result.get("count",""))
+            list_project.append(_result.get("project_name",""))
+            list_money.append(_result.get("whole_money",""))
+            list_competitor.append(_result.get("competitor",""))
+        except Exception as e:
+            # queue.Empty ends the drain loop; any other error is printed too.
+            print(e)
+            break
+    df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
+    df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
+
+def export_count_includeKeyword():
+    # Single-threaded variant: for each 品目 (category) keyword in
+    # ../data/jc001.xlsx, query Solr for the number of 2020-01-01..2020-08-12
+    # documents containing it, and write counts to "<filename>_export.xls".
+    filename = "../data/jc001.xlsx"
+    list_name = []
+    list_count = []
+    df = pd.read_excel(filename)
+    _index = 0
+    for row in df["品目"]:
+        _name = row
+        data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
+        if data is not None:
+            _count = data["response"]["numFound"]
+        else:
+            # Treat a failed query as zero hits rather than aborting the run.
+            _count = 0
+        list_name.append(_name)
+        list_count.append(_count)
+        _index += 1
+        # Progress indicator (one line per keyword processed).
+        print(_index)
+    df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
+    df1.to_excel("%s_export.xls"%filename)
+
+def export_count_includeKeyword_multiThread():
+    # Multi-threaded version of export_count_includeKeyword: same Solr count
+    # per 品目 keyword, fanned out over 20 worker threads.
+
+    def _handler(item,result_queue):
+        # item is one keyword; emits [keyword, hit_count].
+        data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
+        if data is not None:
+            _count = data["response"]["numFound"]
+        else:
+            _count = 0
+        result_queue.put([item,_count])
+
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    filename = "../data/jc001.xlsx"
+    list_name = []
+    list_count = []
+    df = pd.read_excel(filename)
+    _index = 0
+    for row in df["品目"]:
+        _name = row
+        task_queue.put(_name)
+        _index += 1
+    multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
+    multHandler.run()
+    # Drain results; queue.Empty terminates the loop.
+    while(True):
+        try:
+            item = result_queue.get(False)
+            list_name.append(item[0])
+            list_count.append(item[1])
+        except queue.Empty as e:
+            break
+    df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
+    df1.to_excel("%s_export.xls"%filename)
+
+def exportKeywords():
+    # For each keyword in ../data/品目.txt: query Solr for 2019 court-related
+    # doc ids, resolve each doc's project_name from MongoDB (200 threads),
+    # then print raw doc counts and distinct-project counts per keyword.
+    def _handle(item,result_queue,pool_mongo):
+        # Worker: look up project_name for one docId, first in
+        # zhaobiao_extraction, falling back to zhongbiao_extraction.
+        docId = item["docId"]
+        mongo = pool_mongo.getConnector()
+        zhongbiao = mongo.zhongbiao_extraction
+        zhaobiao = mongo.zhaobiao_extraction
+        _project = ""
+        rows = zhaobiao.find({"docId":docId},{"project_name":1})
+        find_flag = False
+        for row in rows:
+            find_flag = True
+            _project = row.get("project_name","")
+        if not find_flag:
+            rows = zhongbiao.find({"docId":docId},{"project_name":1})
+            for row in rows:
+                _project = row.get("project_name","")
+        item["project_name"] = _project
+        pool_mongo.putConnector(mongo)
+        result_queue.put(item)
+    list_key = []
+    dict_key_ids = dict()
+    # NOTE(review): readline() keeps the trailing newline, so keys include
+    # "\n" — consistent here because the same raw string is used throughout.
+    with open("../data/品目.txt", "r", encoding="utf8") as f:
+        while(True):
+            row = f.readline()
+            if not row:
+                break
+            list_key.append(row)
+            dict_key_ids[row] = []
+            data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
+            for item in data["response"]["docs"]:
+                dict_key_ids[row].append(item["id"])
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for _key in dict_key_ids.keys():
+        for item in dict_key_ids[_key]:
+            task_queue.put({"docId":item,"project_name":""})
+    pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
+    mt.run()
+    # docId -> project_name lookup built from worker results.
+    dict_docId_projectname = {}
+    while(True):
+        try:
+            item = result_queue.get(False)
+            dict_docId_projectname[item["docId"]] = item["project_name"]
+        except Exception:
+            break
+    # Distinct projects per keyword (set de-duplicates identical names).
+    dict_key_count = dict()
+    for _key in dict_key_ids.keys():
+        set_key = set()
+        for docId in dict_key_ids[_key]:
+            set_key.add(dict_docId_projectname.get(docId,""))
+        dict_key_count[_key] = len(set_key)
+    print("==")
+    for _key in list_key:
+        print(len(dict_key_ids[_key]))
+    print("==")
+    for _key in list_key:
+        print(dict_key_count[_key])
+    print("==")
+
+def getIndustryCompany():
+    # For each company in ../data/同行客户匹配.xlsx: enrich from the OTS
+    # "enterprise" table (capital, legal person, phone, industry, ...), fetch
+    # up to 10 same-industry peers via the enterprise search index, count its
+    # winning bids in Neo4j, and export everything to "<filename>_export.xlsx".
+    def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
+        # mongoDB = getConnect_mongodb()
+        log(item["enterprise_name"])
+        # Mongo connector is taken/returned but no longer used — the old
+        # enterprise_profile lookups below are commented out in favor of OTS.
+        mongoDB = pool_mongo.getConnector()
+        # coll_zb = mongoDB.enterprise_profile
+        # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
+        # for row in rows:
+        #     item["regCapital"] = row.get("regCapital","")
+        #     item["legal_person"] = row.get("legal_person","")
+        #     item["phone"] = row.get("phone","")
+        #     item["actualCapital"] = row.get("actualCapital","")
+        #     item["industry"] = row.get("industry","")
+        #     item["estiblishTime"] = row.get("estiblishTime","")
+        #     item["socialStaffNum"] = row.get("socialStaffNum","")
+        #     item["businessScope"] = row.get("businessScope","")
+        # graph = getConnect_neo4j()
+
+        ots_client = pool_ots.getConnector()
+
+        primary_key = [('name',item["enterprise_name"])]
+
+        columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
+
+        consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
+
+        if return_row is not None:
+            # Copy every returned attribute column (name, value) onto item.
+            for att in return_row.attribute_columns:
+                item[att[0]] = att[1]
+
+            list_same_industry_company = []
+
+            if "industry" in item:
+                # Up to 10 peer companies sharing the same industry tag.
+                bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
+
+                col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
+                rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
+                                                                                  SearchQuery(bool_query, limit=10, get_total_count=True),
+                                                                                  col)
+
+                for row in rows:
+                    for item1 in row[0]:
+                        list_same_industry_company.append(item1[1])
+
+
+            # if "industry" in item:
+            #     rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
+            #     for row in rows:
+            #         print(row)
+            #         list_same_industry_company.append(row.get("enterprise_name",""))
+            item["same_industry_company"] = list_same_industry_company
+
+        graph = pool_neo4j.getConnector()
+        company_name = item["enterprise_name"]
+        # Count of winning-bid projects for this company in the graph.
+        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
+        finded = graph.run(cql)
+        data = json.loads(json.dumps(finded.data()))
+        _count = data[0]["_c"]
+        # list_project = []
+        # for _data in data:
+        #     if _count<=3:
+        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
+        #             if _data["project_name"] is not None:
+        #                 list_project.append(_data["project_name"])
+        #     _count += 1
+        item["count"] = _count
+        # item["project"] = str(list_project)
+        result_queue.put(item)
+        pool_mongo.putConnector(mongoDB)
+        pool_neo4j.putConnector(graph)
+        pool_ots.putConnector(ots_client)
+
+    log_tofile("export.log")
+    pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
+    pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
+    pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    # list_company = getCompanys()
+    # filename = "".join(["环境","生态","再生","回收","环保"])
+    list_company = []
+    filename = "../data/同行客户匹配.xlsx"
+    # NOTE(review): "sheetname" was removed in newer pandas (use "sheet_name");
+    # this call assumes an old pandas version — confirm against the env.
+    df = pd.read_excel(filename,sheetname=0)
+    for _com in df["公司名称"]:
+        print(_com)
+        if _com is not None and _com.strip()!="":
+            _company = {"enterprise_name":""}
+            _company["enterprise_name"] = _com
+            list_company.append(_company)
+    task_queue = queue.Queue()
+    for item in list_company:
+        task_queue.put(item)
+    result_queue = queue.Queue()
+    _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
+    _muti.run()
+    # Workers mutate the shared item dicts in list_company, so the export is
+    # built from list_company rather than from result_queue.
+    df_company = {}
+    set_key = set()
+    if len(list_company)>0:
+        for item in list_company:
+            for _key in item.keys():
+                set_key.add(_key)
+                if _key not in df_company:
+                    df_company[_key] = []
+    list_key = list(set_key)
+    for item in list_company:
+        for _key in list_key:
+            df_company[_key].append(item.get(_key,""))
+    df1 = pd.DataFrame(df_company)
+
+    df1.to_excel("%s_export.xlsx"%(filename))
+
+
+
+def exportWin_tenderer(time_from,time_to):
+    '''
+    Export winning-bid announcements (docchannel 101) whose page_time falls in
+    [time_from, time_to]: pages through the OTS "document" index, extracts the
+    winner and runner-ups from sub_docs_json, enriches the winner with
+    enterprise details (OTS) and contacts (Solr), and writes an xlsx report.
+    :return: None (side effect: writes ../data/<time_from>-<time_to>中标信息.xlsx)
+    '''
+    ost_client = getConnect_ots()
+    last_docid = 0
+    bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
+                                         TermQuery("docchannel",101),
+                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                         RangeQuery('docid', last_docid, include_lower=False)])
+
+    rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
+                                                                      ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+
+    list_project = []
+    def _getRow(list_project,rows,last_docid):
+        # Parse one OTS result page: positional row layout is
+        # row[0]=primary key (docid), row[1]=attributes (project_name, sub_docs_json).
+        for row in rows:
+            project_name = row[1][0][1]
+            docid = row[0][1][1]
+            last_docid = docid
+            list_pack = json.loads(row[1][1][1])
+            _set_tenderer = set()
+            win_tenderer = ""
+            # First package with a win_tenderer wins; 2nd/3rd place bidders
+            # across all packages are collected as "tenderer".
+            for _pack in list_pack:
+                if "win_tenderer" in _pack and win_tenderer=="":
+                    win_tenderer = _pack["win_tenderer"]
+                if "second_tenderer" in _pack:
+                    _set_tenderer.add(_pack["second_tenderer"])
+                if "third_tenderer" in _pack:
+                    _set_tenderer.add(_pack["third_tenderer"])
+            list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
+        return last_docid
+    _getRow(list_project,rows,last_docid)
+    # Page through the remaining results via next_token.
+    while(next_token):
+        print("%d/%d"%(len(list_project),total_count))
+        rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
+                                                                          ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
+        last_docid = _getRow(list_project,rows,last_docid)
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for item in list_project:
+        task_queue.put(item)
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    def _handle(item,result_queue,pool_ots):
+        # Enrich the winner with enterprise profile (OTS) + contacts (Solr).
+        if item["win_tenderer"]!="":
+            ots_client = pool_ots.getConnector()
+            # NOTE(review): this uses the outer ost_client, not the pooled
+            # ots_client just acquired — all 30 threads share one connection;
+            # likely a typo worth confirming.
+            consumed, return_row, next_token = ost_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
+            _dict = dict()
+            for _item in return_row.attribute_columns:
+                _dict[_item[0]] = _item[1]
+            for _key in _dict.keys():
+                item[_key] = _dict[_key]
+            data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
+            # Only the first contact record is kept.
+            for _item in data["response"]["docs"]:
+                for _key in _item.keys():
+                    item[_key] = _item[_key]
+                break
+            pool_ots.putConnector(ots_client)
+
+
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots);
+    mt.run()
+    # Build the export from list_project (items were mutated in place).
+    keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
+    df_data = {}
+    for _key in keys:
+        df_data[_key] = []
+    for item in list_project:
+        for _key in keys:
+            if _key in item:
+                df_data[_key].append(item[_key])
+            else:
+                df_data[_key].append("")
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
+
+def exportContact():
+    # Reads the "tenderer" column of a previously-exported 中标信息 xlsx,
+    # splits the stringified lists into company names, fetches each company's
+    # contacts from Solr, and writes a 竞争对手 (competitor) contacts xlsx.
+    time_from = "2021-01-14"
+    time_to = "2021-01-15"
+    filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
+    df1 = pd.read_excel(filename)
+    set_company = set()
+    for item in df1["tenderer"]:
+        # "tenderer" cells hold str(list) like "['a', 'b']"; split the
+        # delimiters back out to recover the names.
+        list_company = re.split("\['|', '|'\]|\[\]",item)
+        for _company in list_company:
+            if _company!="":
+                set_company.add(_company)
+
+    companys = list(set_company)
+    task_queue = queue.Queue()
+    list_company = []
+    for _company in companys:
+        item = {"company_name":_company}
+        list_company.append(item)
+        task_queue.put(item)
+    result_queue = queue.Queue()
+    def _handle(item,result_queue):
+        # Worker mutates item in place (results read from list_company later,
+        # not from result_queue); keeps only the first contact record.
+        company = item["company_name"]
+        data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
+        for _item in data["response"]["docs"]:
+            for _key in _item.keys():
+                item[_key] = _item[_key]
+            break
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30);
+    mt.run()
+    keys = ["company_name","contact_person","mobile_no","phone_no"]
+    df_data = {}
+    for _key in keys:
+        df_data[_key] = []
+    # Strip control characters that openpyxl/xlsxwriter reject in cells.
+    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
+    for item in list_company:
+        for _key in keys:
+            if _key in item:
+                df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
+            else:
+                df_data[_key].append("")
+    df = pd.DataFrame(df_data)
+    df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
+
+def countKeyword():
+    # Streams 招标 (docchannel 52) HTML bodies for 2020-09-01..07 from MySQL,
+    # strips tags with BeautifulSoup, counts how many documents match each
+    # keyword from 万郡绿建细分关键词.xls, and writes 关键词统计.xlsx.
+    conn = getConnection_mysql()
+    cursor = conn.cursor()
+    print(0)
+    sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
+    cursor.execute(sql)
+    print(0.1)
+    df = pd.read_excel("万郡绿建细分关键词.xls")
+    list_keywords = df["细分类别"]
+    dict_keywords = dict()
+    for _key in list_keywords:
+        dict_keywords[_key] = 0
+    print(1)
+    from bs4 import BeautifulSoup
+    # Fetch in 10k-row batches to bound memory on the large result set.
+    while(True):
+        rows = cursor.fetchmany(10000)
+        print("==")
+        if not rows:
+            break
+        for row in rows:
+            _html = BeautifulSoup(row[0],"lxml").getText()
+            for _key in list_keywords:
+                # Keyword strings are used as regex patterns directly.
+                if re.search(_key,_html) is not None:
+                    dict_keywords[_key] += 1
+    print(dict_keywords)
+    list_count = []
+    for _key in list_keywords:
+        list_count.append(dict_keywords[_key])
+    df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
+    df1.to_excel("关键词统计.xlsx")
+
+def countKeyword_solr():
+    # For each 业务关键词 in ../data/关键词11.xlsx, query Solr for 2020
+    # court-related hit counts in both 中标 (101) and 招标 (52) channels, and
+    # export the per-keyword counts.
+    def _handle(item,result_queue):
+        keyword = item["keyword"]
+        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
+        _num = data["response"]["numFound"]
+        item["zhongbiao"] = _num
+        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
+        _num = data["response"]["numFound"]
+        item["zhaobiao"] = _num
+        result_queue.put(item)
+    file = "../data/关键词11.xlsx"
+    df = pd.read_excel(file)
+    task_queue = queue.Queue()
+    print(df.keys())
+    for item in df["业务关键词"]:
+        task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
+    result_queue = queue.Queue()
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
+    mt.run()
+    list_keyword = []
+    list_zhaobiao = []
+    list_zhongbiao = []
+    # Drain results; any exception (queue.Empty) ends the loop.
+    while(True):
+        try:
+            item = result_queue.get(False)
+            list_keyword.append(item["keyword"])
+            list_zhaobiao.append(item["zhaobiao"])
+            list_zhongbiao.append(item["zhongbiao"])
+        except Exception:
+            break
+    df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
+    df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
+
+def query_from_solr():
+    # One-off count: "法律" docs in channels 51/104/52/102 for 湖南 province,
+    # published 2020-01-01..2020-01-20; prints the hit count.
+    data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
+    _num = data["response"]["numFound"]
+    print(_num)
+
+def export_province_keyword_count():
+    # Per-province search for winning-bid announcements whose titles match at
+    # least 2 of a hard-coded logistics keyword list; collects the top-20
+    # hits per province (by bidding budget) and exports a flattened report.
+    def _handle(item,result_queue,pool_ots):
+        columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
+        ots_client = pool_ots.getConnector()
+        _province = item["province"]
+        print(item)
+        # keywords = item["keyword"]+" "+item["key"]
+        list_keyword = item["keyword"]
+        # for _temp in keywords.split(" "):
+        #     if len(_temp)>0:
+        #         list_keyword.append(_temp)
+        should_queries = []
+        must_not_q = []
+        for _temp in list_keyword:
+            # Title must contain keywords; tenderee containing a keyword is
+            # excluded (filters out logistics firms acting as tenderee).
+            should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
+            must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
+
+
+        # Require at least 2 distinct keyword matches in the title.
+        bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
+        # NOTE(review): page_time below is read but unused — the date range is
+        # hard-coded to 2021-04-22..27; confirm before reuse.
+        page_time = item["page_time"]
+        bool_query = BoolQuery(must_queries=[bool_query_keyword
+                                             # ,WildcardQuery("publishtime","%s*"%page_time)
+                                             # ,MatchPhraseQuery("doctitle","服务")
+                                             ,RangeQuery("page_time","2021-04-22","2021-04-27",include_lower=True,include_upper=True),
+                                             TermQuery("docchannel",101),
+                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                             WildcardQuery('province', '%s*'%_province)
+                                             ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
+                                             ],
+                               must_not_queries=must_not_q)
+
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
+                                                                          ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
+
+        item["count"] = total_count
+        list_data = getRow_ots(rows)
+
+        item["list_data"] = list_data
+        print(item)
+
+        pool_ots.putConnector(ots_client)
+
+    # Province list: ctype==20 rows of 省份信息.xlsx.
+    df = pd.read_excel("../data/省份信息.xlsx")
+    list_province = []
+    for _name,_type in zip(df["cname"],df["ctype"]):
+        if _type==20:
+            list_province.append(_name)
+
+    # filename = "../data/2021-02关键词导出数据.xlsx"
+    # dict_keyword = {}
+    # df1 = pd.read_excel(filename,dtype=str)
+    # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
+    #     print("===",str(_keyword))
+    #     dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
+    # for _key in df1["关键词"]:
+    #     dict_keyword[_key] = ""
+    # Hard-coded logistics keywords, whitespace-separated.
+    keyword_str = '''
+    快递	物流	供应链	运输	配送
+仓储	冷链	整车 服务
+    '''
+    list_key = []
+    for _k in re.split("\s",keyword_str):
+        _k1 = _k.strip()
+        if len(_k1)>0:
+            list_key.append(_k1)
+
+
+    list_task = []
+    page_time = "2020-11"
+    for _province in list_province:
+        list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
+
+    task_queue = queue.Queue()
+    for item in list_task:
+        task_queue.put(item)
+
+    result_queue = queue.Queue()
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    dict_key_data = dict()
+    # Flatten per-province result pages; workers filled item["list_data"].
+    list_data = []
+    for item in list_task:
+        list_data.extend(item["list_data"])
+    dict_channel = getDict_docchannel()
+    df_data= {}
+    print(list_data)
+    for row in list_data:
+        item = {}
+        _dict = row
+        set_dict_item(item,"docid",_dict.get("docid",""))
+        set_dict_item(item,"公告标题",_dict.get("doctitle",""))
+        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
+        set_dict_item(item,"省份",_dict.get("province",""))
+        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
+        set_dict_item(item,"城市",_dict.get("city",""))
+        set_dict_item(item,"发布时间",_dict.get("page_time",""))
+
+        set_dict_item(item,"项目编号",_dict.get("project_code",""))
+        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
+        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
+        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
+        set_dict_item(item,"代理单位",_dict.get("agency",""))
+        set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
+        set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
+        # Detail-page URL carries an AES-encrypted {"docid": ...} payload.
+        set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
+
+        # Pull winner/price fields out of the per-package sub_docs_json.
+        sub_docs_json = _dict.get("sub_docs_json")
+        for _doc in json.loads(sub_docs_json):
+            if "win_tenderer" in _doc:
+                set_dict_item(item,"中标单位",_doc["win_tenderer"])
+            if "win_tenderee_manager" in _doc:
+                set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
+            if "win_tenderee_phone" in _doc:
+                set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
+            if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
+                set_dict_item(item,"中标金额",_doc["win_bid_price"])
+            if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
+                set_dict_item(item,"招标金额",_doc["bidding_budget"])
+        # Backfill blanks so every row has the full column set.
+        if "招标金额" not in item:
+            set_dict_item(item,"招标金额","")
+        if "中标金额" not in item:
+            set_dict_item(item,"中标金额","")
+        if "中标单位" not in item:
+            set_dict_item(item,"中标单位","")
+        if "中标单位联系人" not in item:
+            set_dict_item(item,"中标单位联系人","")
+        if "中标单位联系电话" not in item:
+            set_dict_item(item,"中标单位联系电话","")
+
+
+        # _line was used for de-duplication; the filter is currently disabled.
+        _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
+        # if _line in set_line:
+        #     continue
+        # if item["招标金额"]=="":
+        #     continue
+        # set_line.add(_line)
+        for k,v in item.items():
+            if k not in df_data:
+                df_data[k] = []
+            df_data[k].append(v)
+    df1 = pd.DataFrame(df_data)
+    df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
+    # for item in list_task:
+    #     dict_key_data[item["key"]][item["province"]] = item
+    # dict_key_province = dict()
+    # dict_key_province["关键词"] = []
+    # for _province in list_province:
+    #     dict_key_province[_province] = []
+    # for _key in dict_keyword.keys():
+    #     dict_key_province["关键词"].append(_key)
+    #     for _province in list_province:
+    #         dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
+    # columns = ["关键词"]
+    # columns.extend(list_province)
+    # df2 = pd.DataFrame(dict_key_province)
+    # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
+
+def export_keyword_count():
+    # For each 关键词 in ../data/医院.xlsx, count documents whose doctextcon
+    # (or doctitle) phrase-matches it within a publishtime range, and export
+    # keyword/count/range/type columns to "<filename>_数量导出.xlsx".
+    def _handle(item,result_queue,pool_ots):
+        ots_client = pool_ots.getConnector()
+
+
+        bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",item["range_from"],item["range_to"]),
+                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
+                                             MatchPhraseQuery(item["type"], item["keyword"])
+                                             ])
+
+        # limit=1: only total_count is needed, not the documents themselves.
+        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
+                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
+                                                                          ColumnsToGet(return_type=ColumnReturnType.ALL))
+        item["count"] = total_count
+        pool_ots.putConnector(ots_client)
+
+
+    range_from = "2019-01-01"
+    range_to = "2022-12-23"
+    _type = "doctextcon"
+
+    # Only these two fields are valid phrase-match targets.
+    assert _type in ["doctitle","doctextcon"]
+    list_dict_key_count = []
+    filename = "../data/医院.xlsx"
+    df = pd.read_excel(filename)
+    for item in df["关键词"]:
+        list_dict_key_count.append({"keyword":item,"count":0,"range_from":range_from,"range_to":range_to,"type":_type})
+
+
+    task_queue = queue.Queue()
+    for item in list_dict_key_count:
+        task_queue.put(item)
+
+    result_queue = queue.Queue()
+    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
+    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
+    mt.run()
+    # Workers mutate the shared dicts, so export straight from the task list.
+    columns = ["keyword","count","range_from","range_to","type"]
+    df_data = {}
+    for _c in columns:
+        df_data[_c] = []
+    for item in list_dict_key_count:
+        for _c in columns:
+            if _c in item:
+                df_data[_c].append(item[_c])
+            else:
+                df_data[_c].append("")
+    df2 = pd.DataFrame(df_data)
+    df2.to_excel("%s_数量导出.xlsx"%filename,columns=columns)
+
def export_keyword_title():
    """Export docid/doctitle/tenderee for every announcement whose title
    matches any of the medical-institution keywords below, then dump the
    result to CSV files of at most 100k rows each.

    Single-keyword conditions match directly; multi-keyword conditions are
    AND-combined, and the condition list itself is OR-combined.
    """
    ots_client = getConnect_ots()
    range_from = "2020-01-01"
    range_to = "2022-12-23"
    list_condition = [["医务室"],
                      ["医院"],
                      ["卫生院"],
                      ["卫生所"],
                      ["卫生室"],
                      ["社区卫生服务中心"]]

    # Build an OR-of-ANDs title query from the condition list.
    list_should_query = []
    for _c in list_condition:
        if len(_c)==1:
            list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
        else:
            _must_query = []
            for _q in _c:
                _must_query.append(MatchPhraseQuery("doctitle",_q))
            list_should_query.append(BoolQuery(must_queries=_must_query))
    keyword_query = BoolQuery(should_queries=list_should_query)

    bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                         keyword_query
                                         ])

    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))

    df_data = {"docid":[],"doctitle":[],"tenderee":[]}
    def setData(df_data,rows):
        # Append one row per returned document; missing fields become "".
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            docid = _dict.get("docid","")
            doctitle = _dict.get("doctitle","")
            tenderee = _dict.get("tenderee","")

            df_data["docid"].append(docid)
            df_data["doctitle"].append(doctitle)
            df_data["tenderee"].append(tenderee)
    setData(df_data,rows)
    _count = len(rows)
    # Page through the remaining results. _count is only a progress estimate:
    # it assumes every page is a full 100 rows.
    while next_token:
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
        setData(df_data,rows)
        _count += 100
        print(_count,total_count)
    # Split the accumulated columns into CSV files of file_length rows each.
    file_begin = 0
    file_length = 100000
    _count = 0
    while file_begin<len(df_data["docid"]):
        _dict = dict()
        for _key,v in df_data.items():
            _dict[_key] = v[file_begin:file_begin+file_length]
        _count += 1
        file_begin += file_length
        df = pd.DataFrame(_dict)
        df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
+
def exportArticle_by_websource():
    """Slice a previously exported per-websource document dump into a smaller
    CSV of (document_id, document_text) pairs.

    The commented-out section below originally built ../data/websouce_doc.csv
    by sampling up to 5 招标 (channel 52) and 5 中标 (channel 101) documents per
    web_source_no from OTS; only the re-slicing of that CSV is active now.
    """
    # conn = getConnection_testmysql()
    # cursor = conn.cursor()
    # sql = "select web_source_no from web_source"
    # cursor.execute(sql)
    # rows = cursor.fetchmany(10)
    # dict_websource = dict()
    # while(rows):
    #     for row in rows:
    #         web_source_no = row[0]
    #         dict_websource[web_source_no] = []
    #     rows = cursor.fetchmany(1000)
    #
    # task_queue = queue.Queue()
    # for _key in dict_websource.keys():
    #     task_queue.put({"key":_key,"list":dict_websource[_key]})
    #
    # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
    # result_queue = queue.Queue()
    # def _handle(item,result_queue,pool_ots):
    #     _key = item["key"]
    #     print(_key)
    #     ots_client = pool_ots.getConnector()
    #     bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
    #                                          TermQuery('web_source_no', '%s'%_key)
    #                                          ])
    #
    #     is_all_succeed = False
    #
    #     while(not is_all_succeed):
    #         try:
    #             rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
    #                                                                               SearchQuery(bool_query, limit=100, get_total_count=True),
    #                                                                               ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
    #             list_zhaobiao = []
    #             list_zhongbiao = []
    #             for row in rows:
    #                 _dict = dict()
    #                 for values in row:
    #                     for _v in values:
    #                         _dict[_v[0]] = _v[1]
    #                 if _dict["docchannel"]==52:
    #                     list_zhaobiao.append(_dict)
    #                 elif _dict["docchannel"]==101:
    #                     list_zhongbiao.append(_dict)
    #             item["list"].extend(list_zhaobiao[:5])
    #             item["list"].extend(list_zhongbiao[:5])
    #         except Exception as e:
    #             print(str(e))
    #
    #     pool_ots.putConnector(ots_client)
    #
    # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
    # mt.run()
    # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
    # for k,v in dict_websource.items():
    #     for item in v:
    #         df_data["docid"].append(item["docid"])
    #         df_data["web_source_no"].append(k)
    #         df_data["docchannel"].append(item["docchannel"])
    #         df_data["dochtmlcon"].append(item["dochtmlcon"])
    # df = pd.DataFrame(df_data)
    # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")

    df = pd.read_csv("../data/websouce_doc.csv")
    df_2000 = {"document_id":[],"document_text":[]}

    # Take rows [begin, end) of the dump, dropping documents whose HTML is
    # longer than 100k characters.
    begin = 80000
    end = 100000
    _count = 0
    for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
        if len(_text)>100000:
            continue
        df_2000["document_id"].append(_id)
        df_2000["document_text"].append(_text)
    df_2 = pd.DataFrame(df_2000)
    df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
    # save(dict_websource,"../data/dict_websource.pk")
+
def getWinTenderer(sub_doc_json):
    """Return the first "win_tenderer" value found in a sub_docs JSON array.

    :param sub_doc_json: JSON string encoding a list of sub-document dicts;
        may be None, empty, or the literal "null".
    :return: the first win_tenderer value encountered, or "" when the input
        is absent/empty or no entry carries the key.
    """
    # Truthiness check also guards "" (which json.loads would reject).
    if sub_doc_json:
        sub_doc = json.loads(sub_doc_json)
        # A literal "null" decodes to None — iterating it would raise.
        if sub_doc:
            for _doc in sub_doc:
                if "win_tenderer" in _doc:
                    return _doc["win_tenderer"]
    return ""
+
+
def exportDocument_by_keywords(page_time,
                      list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
    """For every keyword, collect docid/tenderee/win_tenderer of all
    channel-101 (中标) announcements since 2017-12-20 whose title matches the
    keyword, and write everything to ../data/exportArticle1_title.csv.

    :param page_time: unused; kept for interface compatibility.
    :param list_keyword: keywords matched (phrase match) against doctitle.
    """

    task_queue = queue.Queue()
    result_queue = queue.Queue()

    for _k in list_keyword:
        task_queue.put(_k)

    def _emit(keyword, rows, result_queue):
        # Flatten an OTS row (groups of (name, value) pairs) into a dict and
        # queue one output record per document.
        for row in rows:
            _dict = dict()
            for values in row:
                for _v in values:
                    _dict[_v[0]] = _v[1]
            result_queue.put({"docid":_dict.get("docid",""),
                              "keyword":keyword,
                              "tenderee":_dict.get("tenderee",""),
                              "win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})

    def _handle(keyword,result_queue):
        ots_client = getConnect_ots()

        bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
                                             MatchPhraseQuery("doctitle",keyword),
                                             TermQuery("docchannel","101")
                                             ])

        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
                                                                          ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
        _emit(keyword, rows, result_queue)
        _count = len(rows)
        print(keyword,next_token,total_count)
        while next_token:
            try:
                # BUG FIX: the original loop re-emitted the previous page before
                # fetching the next one, so the first page was queued twice and
                # the final page was never queued. Fetch first, then emit
                # exactly once per page.
                rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                                  SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
                                                                                  ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
                _emit(keyword, rows, result_queue)
                _count += len(rows)
                print("%s:%d/%d"%(keyword,_count,total_count))
            except Exception as e:
                traceback.print_exc()
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    mt.run()
    # Drain the result queue into columnar form; missing keys default to "".
    df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
    while True:
        try:
            item = result_queue.get(block=True,timeout=1)
            for _k in df_data.keys():
                if _k in item:
                    df_data[_k].append(item[_k])
                else:
                    df_data[_k].append("")
        except queue.Empty as e:
            break
        except Exception as e:
            traceback.print_exc()

    df = pd.DataFrame(df_data)
    df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
+
def exportGovement():
    """Export tenderee contact info (name/contact/phone) of government-like
    tenderees (schools, hospitals, bureaus, ...) in a fixed province list,
    restricted to channel-52 (招标公告) announcements published since page_time.
    Only rows carrying both a contact name and a phone number are kept.
    """
    # OR over tenderee-name fragments ...
    should_queries1 = []
    for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
        should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
    # ... AND an OR over target provinces.
    should_queries2 = []
    for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
        should_queries2.append(WildcardQuery("province","*%s*"%_temp))
    ots_client = getConnect_ots()
    page_time = "2020-12"
    bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
                                         BoolQuery(should_queries=should_queries2),
                                         TermQuery("docchannel","52"),
                                         RangeQuery("publishtime",page_time)])
    columns = ["tenderee","tenderee_contact","tenderee_phone"]
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))

    print(total_count)
    def getRow(rows,df_data,columns):
        # Flatten each OTS row and keep it only when both contact fields exist.
        for row in rows:
            _dict = dict()
            for part in row:
                for item in part:
                    _dict[item[0]] = item[1]
            if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
                for key in columns:
                    df_data[key].append(_dict.get(key,""))
    all_rows = 0
    df_data = {}
    for key in columns:
        df_data[key] = []
    getRow(rows,df_data,columns)
    _count = 100
    # Page through the remaining results; _count assumes full 100-row pages
    # and is only used for progress printing.
    while(next_token):
        print(_count,total_count)
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        _count += 100
        getRow(rows,df_data,columns)

    df2 = pd.DataFrame(df_data)
    df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
+
def export_attachment():
    """For each keyword in the Excel sheet, search matching documents and
    download attachment-looking links (zip/rar/doc/xls/xlsx/pdf/txt URLs found
    in the document HTML) into ../data/attachment/.

    Per keyword, paging stops after ~20k scanned documents or once more than
    20 documents yielded at least one successful download.
    """
    filename = "../data/销售部-字段名.xlsx"

    df = pd.read_excel(filename)

    list_dict = []
    for _key in df["关键词"]:
        if len(_key)>0:
            list_dict.append({"keyword":_key})

    def _handle(_dict,result_queue,pool_ots):
        _keyword = _dict["keyword"]
        ots_client = pool_ots.getConnector()

        # Match the keyword in either the title or the full text.
        keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword),
                                                  MatchPhraseQuery("doctitle",_keyword)])

        bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True),
                                             keyword_query])
        columns = ["dochtmlcon","docid"]
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        print(_keyword,total_count)
        hit_count = 0
        def getData(rows,hit_count):
            # Download every attachment link; a document counts as a "hit"
            # when at least one of its downloads succeeded.
            list_fields = getRow_ots(rows)
            for _fields in list_fields:
                dochtmlcon = _fields["dochtmlcon"]
                docid = _fields["docid"]
                _flag = False
                for url,suffix in re.findall("(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon):
                    try:
                        result = requests.get(url,stream=True,timeout=100)
                        # BUG FIX: only write (and count) on HTTP 200 — the
                        # original wrote the file regardless of status and
                        # referenced a `filename` local that was unbound on
                        # non-200 responses.
                        if result.status_code==200:
                            attach_name = get_file_name(url,result.headers)
                            with open("../data/attachment/%s_%d_%s"%(_keyword,docid,attach_name),"wb") as f:
                                f.write(result.content)
                            _flag = True
                    except Exception:
                        pass
                if _flag:
                    hit_count += 1
            return hit_count

        hit_count = getData(rows,hit_count)
        _count = len(rows)
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            _count += len(rows)
            if _count>20000:
                break
            hit_count = getData(rows,hit_count)
            if hit_count>20:
                break

        pool_ots.putConnector(ots_client)

    task_queue = queue.Queue()
    result_queue = queue.Queue()

    for item in list_dict:
        task_queue.put(item)

    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
    mt.run()
+
def exportIndustryCount():
    """Count documents per industry type over a date window and save sample
    HTML pages per industry.

    NOTE(review): the early `return` after the print below makes everything
    past it unreachable — this looks like a leftover debugging short-circuit;
    remove the return to re-enable the export.
    """
    import codecs
    time_from = "2020-12-21"
    time_to = "2020-12-25"

    # dict_channel = {"51":{"type":"公告变更"},
    #                 "52":{"type":"招标公告"},
    #                 "101":{"type":"中标信息"},
    #                 "102":{"type":"招标预告"},
    #                 "103":{"type":"招标答疑"},
    #                 "104":{"type":"招标文件"},
    #                 "105":{"type":"资审结果"},
    #                 "103":{"type":"招标控制价"},
    #                 "100":{"type":"未知类型"}}


    # Industry names come from the pickled classification mapping.
    dict_industry = {}
    meta_industry = load("../data/class2dalei_menlei.pkl")
    for _key in meta_industry.keys():
        dict_industry[_key] = {"type":_key}
    print(dict_industry.keys())
    return
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _key in dict_industry.keys():
        task_queue.put(dict_industry[_key])

    def _handle(item,result_queue,pool_ots):
        # Per industry: record the document count in the window, then save the
        # HTML of the 10 earliest documents of that industry.
        ots_client = pool_ots.getConnector()
        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
                                             RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
        columns = ["docid"]
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1,get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        item["count"] = total_count
        columns = ["dochtmlcon"]
        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for part in row:
                for v in part:
                    _dict[v[0]] = v[1]
            # docid is the table primary key, so it is present in the row even
            # though only dochtmlcon was requested.
            with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
                f.write(_dict["dochtmlcon"])



        pool_ots.putConnector(ots_client)
    pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
    mt.run()
    columns = ["type","count"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for _indus in dict_industry.keys():
        for _c in columns:
            df_data[_c].append(dict_industry[_indus][_c])

    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
+
def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
    '''
    Export every status-201..300 document whose page_time lies in
    [time_from, time_to] into "<data_path>/<time_from>-<time_to>公告信息.xlsx"
    with the requested columns.
    :return: None
    '''
    ost_client = getConnect_ots()
    last_docid = 0  # retained from the original implementation; unused below
    bool_query = BoolQuery(must_queries=[
        RangeQuery("page_time", time_from, time_to, include_lower=True, include_upper=True),
        RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])

    df_data = {}

    def _absorb(batch):
        # Append every returned field to its column list, creating column
        # lists lazily; values are sanitized through getLegal_str.
        for _row in getRow_ots(batch):
            for _field, _value in _row.items():
                df_data.setdefault(_field, []).append(getLegal_str(_value))

    rows, next_token, total_count, is_all_succeed = ost_client.search(
        "document", "document_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
        ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))

    _fetched = len(rows)
    _absorb(rows)

    # Page through the remaining results, printing progress as we go.
    while next_token:
        print("%d/%d" % (_fetched, total_count))
        rows, next_token, total_count, is_all_succeed = ost_client.search(
            "document", "document_index",
            SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
            ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
        _fetched += len(rows)
        _absorb(rows)

    df = pd.DataFrame(df_data)
    df.to_excel("%s/%s-%s公告信息.xlsx" % (data_path, time_from, time_to), columns=columns)
+
def processDocument():
    """Convert the exported announcement Excel sheet to CSV.

    NOTE(review): everything past the early `return` is currently disabled
    dead code that bulk-inserted the rows into MySQL; remove the return to
    re-enable it.
    """
    filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
    df = pd.read_excel(filename)
    df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
    return
    list_dict = []
    for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
        list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})

    task_queue = queue.Queue()
    for _dict in list_dict:
        task_queue.put(_dict)
    result_queue = queue.Queue()

    def _handle(_dict,result_queue,pool_mysql):
        conn = pool_mysql.getConnector()
        cursor = conn.cursor()
        # BUG FIX: the original built the SQL via "%"-interpolation, which left
        # the string values unquoted (a guaranteed syntax error) and was SQL-
        # injectable; use DB-API parameter binding instead.
        # NOTE(review): the original wrote dochtmlcon into the page_time
        # column — preserved as-is here; confirm that is intended.
        sql = "insert into test_extract(docid,doctitle,page_time) values(%s,%s,%s)"
        cursor.execute(sql,(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"]))
        conn.commit()

        pool_mysql.putConnector(conn)
        # url = "http://192.168.2.101:15030"
        # myheaders = {'Content-Type': 'application/json'}
        # print(int(_dict["docid"]))
        # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
        # resp = requests.post(url,json=data,headers=myheaders, verify=True)
        # result = json.loads(resp.content.decode("utf8"),"utf8")
        # _dict["product"] = result["product"]

    pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
    mt.run()

    # columns = ["docid","doctitle","project_name","product"]
    #
    # df_data = {}
    # for _c in columns:
    #     df_data[_c] = []
    # for _dict in list_dict:
    #     for _c in columns:
    #         df_data[_c].append(_dict.get(_c,""))
    # df = pd.DataFrame(df_data)
    # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
+
def export_extract_check():
    '''
    Export the extraction-check results and build a report workbook with a
    per-field summary sheet ("global") and a per-document sheet ("document").
    :return: None; writes ../data/<today>_extract_check.xlsx
    '''

    conn = getConnection_testmysql()
    cursor = conn.cursor()

    sql = " select docid,json_result from exportdb.extract_check "

    cursor.execute(sql)

    # Per-field aggregation: {field: {"online": n, "test": n, "diff": n}}
    dict_global = {}

    df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
    df_document = {"docid":[]}

    while True:
        rows = cursor.fetchmany(10000)
        if not rows:
            break
        for docid,json_result in rows:
            df_document["docid"].append(docid)
            _result = json.loads(json_result)
            for k,v in _result.items():
                # Keys look like "<field>_<online|test|diff>"; split off the
                # suffix to aggregate per field.
                key = k.split("_")
                _key = "_".join(key[:-1])
                # punish/complainants/institutions fields are excluded.
                if "punish" in _key or "complainants" in _key or "institutions" in _key:
                    continue
                if k not in df_document:
                    df_document[k] = []
                df_document[k].append(v)
                key_type = key[-1]
                if _key not in dict_global:
                    dict_global[_key] = {}
                if key_type not in dict_global[_key]:
                    dict_global[_key][key_type] = 0
                if key_type=="diff":
                    dict_global[_key][key_type] += v
                if key_type in ("online","test"):
                    # Strings count 1 when non-empty; lists count their length.
                    if isinstance(v,str):
                        if v!="":
                            dict_global[_key][key_type] += 1
                    elif isinstance(v,list):
                        dict_global[_key][key_type] += len(v)
    for k,v in dict_global.items():
        # BUG FIX: use .get with a default of 0 — a field that never appeared
        # with one of the online/test/diff suffixes used to raise KeyError.
        _online = v.get("online",0)
        _diff = v.get("diff",0)
        df_global["key_type"].append(k)
        df_global["online_count"].append(_online)
        df_global["test_count"].append(v.get("test",0))
        df_global["diff_count"].append(_diff)
        df_global["diff_percent"].append(_diff/_online if _online>0 else 0)

    filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
    with pd.ExcelWriter(filename) as writer:
        df1 = pd.DataFrame(df_global)
        df1.to_excel(writer,sheet_name="global")
        for k,v in df_document.items():
            print(k,len(v))
        df2 = pd.DataFrame(df_document)
        df2.to_excel(writer,sheet_name="document")
        # BUG FIX: no explicit writer.save()/writer.close() — the context
        # manager already saves and closes on exit; ExcelWriter.save() is
        # removed in recent pandas and the extra close() double-closed.
+
+
+
def exportDocument_dump():
    """Export all status-201..300 documents of page_time 2021-03-03 with
    de-duplication helper columns (refined title, prices pulled out of
    sub_docs_json, detail URL) into ../data/0303去重.csv.

    dict_find tracks, per output row, which columns were filled so the rest
    can be padded with "" and every column list keeps equal length.
    """
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
    df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
    df_data = {}
    for _key in df_keys:
        df_data[_key] = []

    bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
                                         RangeQuery("status",201,300,True,True)])

    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    def getData(df_data,rows):
        # Convert one page of OTS rows into the df_data columns.
        list_data = getRow_ots(rows)
        for row in list_data:
            dict_find = {}
            for _key in df_keys:
                dict_find[_key] = 0
            for _k,_v in row.items():
                if _k in df_keys:
                    dict_find[_k] = 1
                    if _k=="project_code":
                        # Quote project codes so spreadsheet tools keep them
                        # as text.
                        _v = '"%s"'%_v
                    df_data[_k].append(_v)
            # Strip boilerplate words from the title for duplicate matching.
            doctitle = row.get("doctitle","")
            df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '',  doctitle))
            df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
            dict_find["URL"] = 1
            dict_find["doctitle_refine"] = 1
            # Pull the first non-empty/non-zero winner and prices out of the
            # sub-documents JSON.
            sub_docs_json = row.get("sub_docs_json","[{}]")
            doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
            if sub_docs_json is not None:
                for sub_docs in json.loads(sub_docs_json):
                    for _key_sub_docs in sub_docs.keys():
                        if _key_sub_docs in doc_columns:
                            if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
                                if _key_sub_docs in ["bidding_budget","win_bid_price"]:
                                    if float(sub_docs[_key_sub_docs])>0:
                                        doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
                                else:
                                    doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
            for _k,_v in doc_columns.items():
                dict_find[_k] = 1
                df_data[_k].append(_v)
            # Pad any column that did not receive a value for this row.
            for _k,_v in dict_find.items():
                if _v==0:
                    df_data[_k].append("")



    _count = len(rows)
    getData(df_data,rows)
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        getData(df_data,rows)
        _count += len(rows)

    # for docid in df["docid"]:
    #     bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
    #
    #     rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
    #                                                                   SearchQuery(bool_query , limit=100, get_total_count=True),
    #                                                                   ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    #     list_data = getRow_ots(rows)
    #     if len(list_data)>0:
    #         dict_find = {}
    #         for _key in df_keys:
    #             dict_find[_key] = 0
    #         for _k,_v in list_data[0].items():
    #             if _k in df_keys:
    #                 dict_find[_k] = 1
    #                 df_data[_k].append(_v)
    #         doctitle = list_data[0].get("doctitle","")
    #         df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '',  doctitle))
    #         dict_find["doctitle_refine"] = 1
    #         sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
    #         doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
    #         if sub_docs_json is not None:
    #             for sub_docs in json.loads(sub_docs_json):
    #                 for _key_sub_docs in sub_docs.keys():
    #                     if _key_sub_docs in doc_columns:
    #                         if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
    #                             if _key_sub_docs in ["bidding_budget","win_bid_price"]:
    #                                 if float(sub_docs[_key_sub_docs])>0:
    #                                     doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    #                             else:
    #                                 doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    #         for _k,_v in doc_columns.items():
    #             dict_find[_k] = 1
    #             df_data[_k].append(_v)
    #         for _k,_v in dict_find.items():
    #             if _v==0:
    #                 df_data[_k].append("")
    df1 = pd.DataFrame(df_data)
    df1.to_csv("../data/0303去重.csv",columns=df_keys)
+
def exportDocument_dump_mysql():
    """Export duplicate-group rows of run_dumplicate_document_his to CSV.

    Selects every row whose group_id occurs more than once, streams the
    result in 100k-row batches (bounded memory) and writes it to
    ../data/0304去重.csv with group_id/docid as the leading columns.
    """
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    try:
        columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]

        df_data = {_c: [] for _c in columns}
        sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
        cursor.execute(sql)
        # fetch in batches so a very large result set does not blow up memory
        while True:
            rows = cursor.fetchmany(100000)
            if not rows:
                break
            for row in rows:
                for _i in range(len(columns)):
                    df_data[columns[_i]].append(row[_i])
        df = pd.DataFrame(df_data)
        # same column set as `columns`, reordered so group_id/docid come first
        # (the original repeated the whole list by hand)
        export_columns = ["group_id","docid"]+[c for c in columns if c not in ("group_id","docid")]
        df.to_csv("../data/0304去重.csv",columns=export_columns)

        print(cursor.description)
    finally:
        # release DB resources even on failure (the original leaked both)
        cursor.close()
        conn.close()
+
def getDict_docchannel():
    """Return a {channel_id: channel_name} dict loaded from sys_channel."""
    conn = getConnection_mysql()
    cursor = conn.cursor()
    try:
        cursor.execute("select channel_id,chnlname from sys_channel ")
        return {row[0]: row[1] for row in cursor.fetchall()}
    finally:
        # close DB resources (the original left cursor and connection open)
        cursor.close()
        conn.close()
+
def exportDocument_by_doctitle():
    """Export award notices whose title matches any keyword to an Excel file.

    One OTS search per keyword (doctitle match-phrase), restricted to
    page_time >= 2021-01-01, status in [201,300], docchannel 101 and the
    provinces 广东/湖南/广西.  Hits are flattened, de-duplicated on
    (project_code, winner, win price) and written to
    ../data/2021-04-14_export11.xlsx using the set_dict_item column order.
    """
    columns = ["docid","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource"]

    dict_channel = getDict_docchannel()

    # keyword grid pasted from a spreadsheet; entries are separated by
    # arbitrary whitespace and split below
    str_keyword = '''
    报批技术服务	不动产	测绘	城市更新	档案整理
房地一体	拆旧复垦	土地规划	城乡规划	村庄规划
技术服务	技术审查	建设用地增减挂钩	勘察	垦造水田
不动产数据建库	不动产数据整合	林权调查	土地调查	城市更新数据调查
不动产平台	测绘系统	地理信息系统	城乡规划信息系统	一张图信息平台
测绘信息平台	双评价	城市更新研究	垦造水田研究报告	生态修复研究
土地规划研究	复垦咨询服务	生态修复咨询服务	城乡规划咨询服务	城市更新咨询服务
勘测定界	多测合一	用地预审	国土规划数据治理	地名普查
地形图	垦造水田咨询服务	评估	全域土地综合整治	生态修复
林权数据建库	权属调查	权籍调查		


    '''
    task_queue = queue.Queue()
    result_queue = queue.Queue()

    for _keyword in re.split(r"\s",str_keyword):
        if len(_keyword.strip())==0:
            continue
        task_queue.put({"keyword":_keyword})

    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)


    def _handle(item,result_queue,pool_ots):
        # Fetch every result page for one keyword and push rows (tagged with
        # the keyword) onto result_queue.
        ots_client = pool_ots.getConnector()

        should_q1 = BoolQuery(should_queries=[MatchPhraseQuery("doctitle",item["keyword"])])

        should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东"),
                                              WildcardQuery('province', '%s*'%"湖南"),
                                              WildcardQuery('province', '%s*'%"广西")])
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-01-01"),
                                             RangeQuery("status",201,300,True,True),
                                             TermQuery("docchannel",101),
                                             should_q1,should_q2])

        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        for _data in list_data:
            _data["keyword"] = item["keyword"]
            result_queue.put(_data)

        _count = len(list_data)
        # page through remaining hits with the continuation token
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            for _data in list_data:
                _data["keyword"] = item["keyword"]
                result_queue.put(_data)

        pool_ots.putConnector(ots_client)
    mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
    mt.run()

    # drain the result queue; queue.Empty marks exhaustion (the original
    # caught a broad Exception and printed it)
    list_item = []
    try:
        while True:
            list_item.append(result_queue.get(False))
    except queue.Empty:
        pass

    df_data = {}

    print(len(list_item))
    set_line = set()
    for row in list_item:
        item = {}
        _dict = row

        set_dict_item(item,"公告id",_dict.get("docid",""))
        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))

        set_dict_item(item,"公告标题",_dict.get("doctitle",""))

        set_dict_item(item,"省份",_dict.get("province",""))
        set_dict_item(item,"城市",_dict.get("city",""))

        set_dict_item(item,"区",_dict.get("district",""))
        set_dict_item(item,"关键词",_dict.get("keyword",""))

        set_dict_item(item,"发布时间",_dict.get("page_time",""))

        set_dict_item(item,"项目编号",_dict.get("project_code",""))
        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
        set_dict_item(item,"公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))

        # flag whether the tenderee appears in the title / contains the keyword
        tenderee_in  = "否"
        tenderee_keyword = "否"
        if _dict.get("tenderee","")!="":
            if _dict.get("tenderee","") in _dict.get("doctitle",""):
                tenderee_in = "是"
            if _dict.get("keyword","") in _dict.get("tenderee",""):
                tenderee_keyword = "是"
        set_dict_item(item,"标题包含招标人",tenderee_in)
        set_dict_item(item,"招标人含有关键词",tenderee_keyword)
        sub_docs_json = _dict.get("sub_docs_json")
        # sub_docs_json may be absent; json.loads(None) raised in the original
        for _doc in (json.loads(sub_docs_json) if sub_docs_json else []):
            if "win_tenderer" in _doc:
                set_dict_item(item,"中标单位",_doc["win_tenderer"])
            if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                set_dict_item(item,"中标金额(万元)",float(_doc["win_bid_price"])/10000)
            if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                set_dict_item(item,"招标金额(万元)",float(_doc["bidding_budget"])/10000)
        # guarantee every exported column exists even without award data
        if "中标单位" not in item:
            set_dict_item(item,"中标单位","")
        if "中标金额(万元)" not in item:
            set_dict_item(item,"中标金额(万元)","")
        if "招标金额(万元)" not in item:
            set_dict_item(item,"招标金额(万元)","")

        # skip duplicates keyed on project / winner / price
        _line = "%s-%s-%s"%(item["项目编号"],item["中标单位"],item["中标金额(万元)"])
        if _line in set_line:
            continue
        set_line.add(_line)
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)


    df = pd.DataFrame(df_data)

    df.to_excel("../data/2021-04-14_export11.xlsx",columns=list_df_columns)
+
# Module-level registry of every column name ever passed to set_dict_item.
# set_columns gives O(1) membership checks; list_df_columns keeps the
# first-seen order and is used as the column order of the Excel/CSV exports.
set_columns = set()
list_df_columns = []
+
def set_dict_item(_dict,name,v):
    """Store the sanitized value under ``name`` and register the column.

    The value is passed through getLegal_str before storage; the first time
    a column name is seen it is appended (also sanitized) to the module-level
    export ordering ``list_df_columns``.
    """
    _dict[name] = getLegal_str(v)
    if name in set_columns:
        return
    set_columns.add(name)
    list_df_columns.append(getLegal_str(name))
+
def exportDocument_medicine(start_time,end_time):
    """Collect medical-IT documents with page_time in [start_time, end_time].

    The OTS query combines three should-groups: content keywords
    (doctextcon), tenderee name fragments and docchannels 52/101/102.
    Paging stops once roughly 300 rows are collected; missing contact
    person/phone fields are then backfilled from the enterprise table.
    Returns the column->values dict for the caller to export (the trailing
    DataFrame export in the original was unreachable after the return and
    has been removed).
    """
    ots_client = getConnect_ots()
    columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]

    dict_channel = getDict_docchannel()
    def getData(df_data,rows,set_line):
        # Flatten one page of OTS rows into the df_data column lists.
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docid",_dict.get("docid",""))
            set_dict_item(item,"公告标题",_dict.get("doctitle",""))
            set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
            set_dict_item(item,"省份",_dict.get("province",""))
            set_dict_item(item,"城市",_dict.get("city",""))
            set_dict_item(item,"发布时间",_dict.get("page_time",""))

            set_dict_item(item,"项目编号",_dict.get("project_code",""))
            set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            set_dict_item(item,"代理单位",_dict.get("agency",""))
            set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
            set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))

            sub_docs_json = _dict.get("sub_docs_json")
            # sub_docs_json may be absent; json.loads(None) raised in the original
            for _doc in (json.loads(sub_docs_json) if sub_docs_json else []):
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
            # guarantee every exported column exists even without award data
            if "招标金额" not in item:
                set_dict_item(item,"招标金额","")
            if "中标金额" not in item:
                set_dict_item(item,"中标金额","")
            if "中标单位" not in item:
                set_dict_item(item,"中标单位","")
            if "中标单位联系人" not in item:
                set_dict_item(item,"中标单位联系人","")
            if "中标单位联系电话" not in item:
                set_dict_item(item,"中标单位联系电话","")

            # de-dup key is tracked but filtering is intentionally disabled
            _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
            set_line.add(_line)
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    list_province = ["全国"]
    for _province in list_province:
        df_data = {}

        # content keywords pasted from a spreadsheet, split on whitespace
        str_p = '''
        智慧医疗系统	医院信息系统	临床路径	医院系统	医院管理软件
县域医共体	远程医疗	医院管理系统	医疗信息化	临床医疗
数据集成	云医院	智慧卫生	卫生信息系统	医疗数字化
临床应用				

            '''
        list_prov = re.split(r"\s",str_p)
        list_mu = []
        for _p in list_prov:
            if _p.strip()=="":
                continue
            print(_p)
            list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))

        s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
        list_should_ten = []
        for _p in re.split("、",s_tenderee):
            # original tested _p.split()=="" which is never true (list vs str);
            # strip() is what was intended
            if _p.strip()=="":
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))

        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",52))
        list_should_chan.append(TermQuery("docchannel",101))
        list_should_chan.append(TermQuery("docchannel",102))

        should_q1 = BoolQuery(should_queries=list_mu)
        should_q2 = BoolQuery(should_queries=list_should_ten)
        should_q3 = BoolQuery(should_queries=list_should_chan)
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
                                             RangeQuery("status",201,300,True,True),
                                             should_q1,should_q2,should_q3])

        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))


        set_line = set()
        _count = len(rows)
        getData(df_data,rows,set_line)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            getData(df_data,rows,set_line)
            _count += len(rows)
            # stop early once ~300 rows are collected
            if len(df_data[list(df_data.keys())[0]])>=300:
                break

        if not df_data:
            # no hits at all: nothing to backfill (original raised KeyError here)
            return df_data

        # backfill missing contact person/phone from the enterprise table
        set_enterprise = set()
        for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
            set_enterprise.add(_tenderee)
            set_enterprise.add(_agency)
            set_enterprise.add(_win_tenderer)
        set_enterprise.discard("")
        set_enterprise.discard(None)
        dict_enterprise = getDictEnterprise(list(set_enterprise))
        if len(set_enterprise)>0:
            for _i in range(len(df_data["招标单位"])):
                _enterprise_name = df_data["招标单位"][_i]
                if df_data["招标联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["招标联系人"][_i] = _person
                        df_data["招标联系人电话"][_i] = _phone

                _enterprise_name = df_data["代理单位"][_i]
                if df_data["代理联系人电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["代理联系人"][_i] = _person
                        df_data["代理联系人电话"][_i] = _phone

                _enterprise_name = df_data["中标单位"][_i]
                if df_data["中标单位联系电话"][_i]=="":
                    contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
                    if contacts is not None:
                        _person,_phone = getOneContact(contacts)
                        df_data["中标单位联系人"][_i] = _person
                        df_data["中标单位联系电话"][_i] = _phone

        return df_data
+
+
def exportDocument_by_pagetime():
    """Export all docchannel-52 documents published on 2021-05-07 to Excel.

    Keyword (should_q1) and tenderee (should_q2) filters are built but
    currently disabled; only the channel filter is applied together with the
    fixed page_time day and status in [201,300].  Output goes to
    ../data/<timestamp>_周五医疗数据导出.xlsx in set_dict_item column order.
    """
    ots_client = getConnect_ots()
    columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]

    dict_channel = getDict_docchannel()
    def getData(df_data,rows,set_line):
        # Flatten one page of OTS rows into the df_data column lists.
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docid",_dict.get("docid",""))
            set_dict_item(item,"公告标题",_dict.get("doctitle",""))
            set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
            set_dict_item(item,"省份",_dict.get("province",""))
            set_dict_item(item,"城市",_dict.get("city",""))
            set_dict_item(item,"发布时间",_dict.get("page_time",""))

            # title with boilerplate words (招标/中标/公告 ...) stripped out
            set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  _dict.get("doctitle","")))

            set_dict_item(item,"项目编号",_dict.get("project_code",""))
            set_dict_item(item,"招标单位",_dict.get("tenderee",""))
            set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
            set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
            set_dict_item(item,"代理单位",_dict.get("agency",""))
            set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
            set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
            set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))

            sub_docs_json = _dict.get("sub_docs_json")
            # sub_docs_json may be absent; json.loads(None) raised in the original
            for _doc in (json.loads(sub_docs_json) if sub_docs_json else []):
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
            # guarantee every exported column exists even without award data
            if "招标金额" not in item:
                set_dict_item(item,"招标金额","")
            if "中标金额" not in item:
                set_dict_item(item,"中标金额","")
            if "中标单位" not in item:
                set_dict_item(item,"中标单位","")
            if "中标单位联系人" not in item:
                set_dict_item(item,"中标单位联系人","")
            if "中标单位联系电话" not in item:
                set_dict_item(item,"中标单位联系电话","")

            # de-dup key is tracked but filtering is intentionally disabled
            _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
            set_line.add(_line)
            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    list_province = ["全国"]
    for _province in list_province:
        df_data = {}

        # content keywords pasted from a spreadsheet, split on whitespace
        str_p = '''
        智慧医疗系统	医院信息系统	临床路径	医院系统	医院管理软件
县域医共体	远程医疗	医院管理系统	医疗信息化	临床医疗
数据集成	云医院	智慧卫生	卫生信息系统	医疗数字化
临床应用				

            '''
        list_prov = re.split(r"\s",str_p)
        list_mu = []
        for _p in list_prov:
            if _p.strip()=="":
                continue
            print(_p)
            list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))

        s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
        list_should_ten = []
        for _p in re.split("、",s_tenderee):
            # original tested _p.split()=="" which is never true (list vs str);
            # strip() is what was intended
            if _p.strip()=="":
                continue
            list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))

        # channels 101/102 are currently disabled; only 52 is exported
        list_should_chan = []
        list_should_chan.append(TermQuery("docchannel",52))

        # should_q1/should_q2 are built (and keep the keyword printout) but
        # are intentionally excluded from the query below
        should_q1 = BoolQuery(should_queries=list_mu)
        should_q2 = BoolQuery(should_queries=list_should_ten)
        should_q3 = BoolQuery(should_queries=list_should_chan)
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2021-05-07","2021-05-07",include_lower=True,include_upper=True),
                                             RangeQuery("status",201,300,True,True),
                                             should_q3])

        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))


        set_line = set()
        _count = len(rows)
        getData(df_data,rows,set_line)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            getData(df_data,rows,set_line)
            _count += len(rows)

        df1 = pd.DataFrame(df_data)
        df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
+
def attachCompanyContact():
    """Enrich exported bid spreadsheets with the tenderee's region.

    For each row, look up the tenderee name ("招标单位") as the primary key
    of the OTS `enterprise` table to obtain province/city/district; fall
    back to getLocation() name parsing when OTS has no row or returns
    "未知"; and when the row has no tenderee name at all, copy the row's
    own region columns.  Writes one "<file>_attach.xlsx" per input file.
    """
    files = ["../data/2021-03-17_四川_关键词导出.csv",
             "../data/2021-03-17_安徽_关键词导出.csv",
             "../data/2021-03-17_江西_关键词导出.csv",
             "../data/2021-03-17_湖南_关键词导出.csv"]

    # NOTE(review): the list above is dead — immediately overwritten by
    # the single-file run below; kept as a record of earlier inputs.
    files = ["../data/欧科自然资源5w以上数据.xlsx"]

    def _handle(item,result_queue,pool_ots):
        # Worker executed by MultiThreadHandler; mutates `item` in place
        # (results are collected from list_item, not result_queue).
        ots_client = pool_ots.getConnector()


        primary_key = [('name',str(item["招标单位"]))]
        columns_to_get = ["province","city","district"]
        consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)



        # bool_query = BoolQuery(must_queries=[TermQuery("nicknames",item["中标单位"])])
        #
        # rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
        #                                                                   SearchQuery(bool_query , limit=1, get_total_count=True),
        #                                                                   ColumnsToGet(["contacts"],return_type=ColumnReturnType.SPECIFIED))
        # _docid = int(item["docid"])
        # partitionkey = _docid%500+1
        # primary_key = [('partitionkey',partitionkey),("docid",_docid)]
        # columns_to_get = ["doctitle"]
        # consumed, return_row, next_token = ots_client.get_row("document",primary_key, columns_to_get, None, 1)
        if isinstance(item["招标单位"],str) and item["招标单位"]!="":
            if return_row is not None:
                # Company found in OTS: take its region columns first.
                _dict = getRow_ots_primary(return_row)
                # item["doctitle"] = _dict.get("doctitle","")
                item["招标人省份"] = _dict.get("province","")
                item["招标人城市"] = _dict.get("city","")
                item["招标人区域"] = _dict.get("district","")

                # Fill any missing/"未知" region parts from name parsing.
                province,city,district = getLocation(item["招标单位"])
                if item["招标人省份"]=="" or item["招标人省份"]=="未知":
                    item["招标人省份"] = province
                if item["招标人城市"]=="" or item["招标人城市"]=="未知":
                    item["招标人城市"] = city
                if item["招标人区域"]=="" or item["招标人区域"]=="未知":
                    item["招标人区域"] = district

            else:
                # No OTS row: rely entirely on parsing the company name.
                province,city,district = getLocation(item["招标单位"])
                item["招标人省份"] = province
                item["招标人城市"] = city
                item["招标人区域"] = district

        else:
            # No tenderee name: reuse the row's own region columns.
            item["招标人省份"] = item["省份"]
            item["招标人城市"] = item["城市"]
            item["招标人区域"] = item["区"]
            # contacts = json.loads(_dict["contacts"])
            # contacts.sort(key=lambda x:x["level"],reverse=True)
            # phone = ""
            # phone_person = ""
            # mobile = ""
            # mobile_person = ""
            # for contact in contacts:
            #     if mobile=="" and contact.get("mobile_no","")!="":
            #         mobile = contact.get("mobile_no","")
            #         mobile_person = contact.get("contact_person","")
            #     if phone=="" and contact.get("phone_no","")!="":
            #         phone = contact.get("phone_no",'')
            #         phone_person = contact.get("contact_person","")
            # item["招标联系人"] = ""
            # item["招标联系人电话"] = ""
            # if mobile!="":
            #     item["招标联系人"] = mobile_person
            #     item["招标联系人电话"] = mobile
            # else:
            #     item["中标单位联系人"] = phone_person
            #     item["中标单位联系电话"] = phone
        pool_ots.putConnector(ots_client)


    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    for file in files:
        task_queue = queue.Queue()
        # NOTE(review): pd.read_excel no longer accepts `encoding` in
        # modern pandas — confirm the pinned pandas version supports it.
        df = pd.read_excel(file,encoding="UTF8")
        # First column (index) is skipped.
        keys = df.keys()[1:]
        list_item = []
        for row in df.itertuples():
            _dict = {}
            for _key in keys:
                if  _key in dir(row):
                    _v = row.__getattribute__(_key)
                else:
                    _v = ''
                _dict[_key] = _v
            list_item.append(_dict)
        for item in list_item:
            task_queue.put(item)

        mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
        mt.run()
        # Rebuild columns from the (now enriched) items and export.
        df_data = {}
        for _k in keys:
            df_data[_k] = []
        for item in list_item:
            for _k in keys:
                df_data[_k].append(getLegal_str(item.get(_k,"-")))
        df1 = pd.DataFrame(df_data)
        df1.to_excel("%s_attach.xlsx"%file,columns=keys)
+
def dumpWebSourceNo():
    """Copy the web-source numbering table from Oracle into test MySQL.

    Reads (source_encode, source_name) pairs from
    bxkc.T_WEBSOURCENUM_INFO and inserts them into the MySQL `webSource`
    table, committing once at the end.
    """
    conn_oracle = getConnection_oracle()
    cursor_oracle = conn_oracle.cursor()
    sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
    cursor_oracle.execute(sql)
    rows = cursor_oracle.fetchall()

    conn_mysql = getConnection_testmysql()
    cursor_mysql = conn_mysql.cursor()
    # Parameterized statement instead of '%s' string interpolation:
    # source names containing quotes would have broken the SQL and the
    # interpolated form is injection-prone.
    insert_sql = " insert into webSource(web_source_no,web_source_name) values(%s,%s)"
    for row in rows:
        print(insert_sql, row[0], row[1])
        cursor_mysql.execute(insert_sql, (row[0], row[1]))
    conn_mysql.commit()
+
def exportNzj():
    """Export all `designed_project` rows that carry a docids list.

    Pages through the OTS `designed_project_index` (sorted by id) and
    flattens selected columns into an Excel file.  Output column order
    comes from the module-level `list_df_columns`, which `set_dict_item`
    presumably maintains — TODO confirm against the helpers' definitions.
    """
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]

    def getData(df_data,rows,set_line):
        # Flatten one page of OTS rows into df_data's column lists.
        # NOTE(review): set_line is accepted but never used here.
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item(item,"docids",_dict.get("docids",""))
            set_dict_item(item,"contacts",_dict.get("contacts",""))
            set_dict_item(item,"covered_area",_dict.get("covered_area",""))
            set_dict_item(item,"follows",_dict.get("follows",""))
            set_dict_item(item,"project_type",_dict.get("project_type",""))
            # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            set_dict_item(item,"page_time",_dict.get("page_time",""))
            set_dict_item(item,"progress",_dict.get("progress",""))

            set_dict_item(item,"project_description",_dict.get("project_description",""))
            set_dict_item(item,"project_follow",_dict.get("project_follow",""))
            set_dict_item(item,"project_code",_dict.get("project_code",""))
            set_dict_item(item,"project_name",_dict.get("project_name",""))

            for k,v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    df_data = {}


    bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])


    rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
                                                                      SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))


    set_line = set()
    _count = len(rows)
    getData(df_data,rows,set_line)
    # Keep fetching pages until OTS stops returning a continuation token.
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
                                                                          SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        getData(df_data,rows,set_line)
        _count += len(rows)

    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
+
def turn_status():
    """Queue documents for a status flip in the test MySQL table.

    Reads document ids from the "公告id" column of the Excel export and
    inserts (partitionkey, docid) rows into `turn_status`, where
    partitionkey mirrors the OTS sharding rule docid % 500 + 1.
    """
    df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")

    conn = getConnection_testmysql()
    cursor = conn.cursor()
    # Parameterized insert instead of %-formatting the SQL string.
    sql = " insert into turn_status(partitionkey,docid) values(%s,%s)"
    for docid in df["公告id"]:
        _docid = int(docid)
        cursor.execute(sql, (_docid % 500 + 1, _docid))

    conn.commit()
+
+
def attachBidding_budget():
    """Backfill empty bidding_budget values in MySQL analysis_r2.

    Collects docids whose bidding_budget is empty, then for each docid
    looks up the merged project row in OTS `project2` (via the docids
    term) and writes its bidding_budget back to analysis_r2.
    """

    conn_mysql = getConnection_testmysql()
    cursor = conn_mysql.cursor()

    sql = "select docid from analysis_r2 where bidding_budget=''"

    task_queue = queue.Queue()
    result_queue = queue.Queue()
    cursor.execute(sql)
    # Stream ids in 10k batches to avoid loading the full result set.
    rows = cursor.fetchmany(10000)
    while(rows):
        for row in rows:
            task_queue.put(row[0])
        rows = cursor.fetchmany(10000)

    pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)

    def _handle(item,result_queue,pool_mysql,pool_ots):
        # item is a docid (int). One OTS lookup, then one UPDATE.
        ots_client = pool_ots.getConnector()

        bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
        rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
                                                                          SearchQuery(bool_query , limit=1, get_total_count=True),
                                                                          ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))

        list_dict = getRow_ots(rows)
        if len(list_dict)>0:
            conn = pool_mysql.getConnector()
            cursor = conn.cursor()

            # NOTE(review): string-built UPDATE — safe only while
            # bidding_budget is numeric text; prefer a parameterized query.
            sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
            cursor.execute(sql)
            conn.commit()

            pool_mysql.putConnector(conn)



        pool_ots.putConnector(ots_client)

    mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
    mt.run()
+
+
def debug_documentMerge():
    """Verify merge results: every merge_docids record from
    project_group_final_log should have a single project2 row containing
    all of its docids.  Records with no such row are printed and queued.
    """
    conn = getConnection_testmysql()

    cursor = conn.cursor()

    sql = "select merge_docids from project_group_final_log "
    cursor.execute(sql)

    task_queue = queue.Queue()
    for row in cursor.fetchall():
        task_queue.put(row[0])

    print(task_queue.qsize())

    def _handle(item,result_queue,pool_ots):
        # item is a comma-separated docid list; require one project2 row
        # matching every docid at once.
        ots_client = pool_ots.getConnector()
        list_docids = item.split(",")

        must_q = []
        for _docid in list_docids:
            must_q.append(TermQuery("docids",_docid))
        bool_query = BoolQuery(must_queries=must_q)

        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                       SearchQuery(bool_query,limit=1,get_total_count=True),
                                                                       ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
        if total_count==0:
            print(item)
            result_queue.put(item)
        pool_ots.putConnector(ots_client)

    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
    mt.run()
    # Drain the result queue.  Bug fix: the original used get(True) with
    # no timeout, which blocks forever once the queue is empty, so the
    # loop never terminated; get(False) raises queue.Empty immediately.
    while(True):
        try:
            item = result_queue.get(False)
            print(item)
        except queue.Empty:
            break
+
def signDocument():
    """Tag each row of the spreadsheet with a bid-failure keyword.

    Scans the "segword" column (whitespace removed) for one of the
    cancellation/failure keywords and writes a single "sign" column —
    the matched keyword, or "无" when none is found — to
    "<filename>_sign.xlsx".
    """
    filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
    sign_filename = "%s_sign.xlsx"%filename
    df = pd.read_excel(filename)

    signs = []
    for segword in df["segword"]:
        content = re.sub("\s*","",segword)
        matched = re.search("(?P<key>流标|废标|中止|终止|撤销|采购失败)",content)
        if matched is None:
            signs.append("无")
        else:
            signs.append(matched.groupdict().get("key"))

    pd.DataFrame({"sign":signs}).to_excel(sign_filename)
+
if __name__=="__main__":
    # Menu of one-off export jobs; exactly one call is enabled per run,
    # the rest are kept commented as a record of previous runs.
    # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
    # processDocument()
    # export_extract_check()
    # exportArticle_by_websource()
    # export_keyword_count()
    # export_province_keyword_count()
    # exportDocument_dump()
    # exportDocument_dump_mysql()
    # export_attachment()
    # exportDocument_by_doctitle()
    # exportIndustryCount()
    # exportDocument_by_pagetime()
    # attachCompanyContact()
    # dumpWebSourceNo()
    # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%138306357))))
    # exportNzj()
    # turn_status()
    # attachBidding_budget()
    # debug_documentMerge()
    exportDocument_medicine("2021-05-02","2021-05-08")
    # signDocument()
+
+

+ 619 - 0
export/exportEnterprice.py

@@ -0,0 +1,619 @@
+#coding:UTF8
+
+import sys
+import os
+sys.path.append("../")
+
+import pandas as pd
+from dataSource.source import *
+import json
+from utils.multiThread import MultiThreadHandler
+import queue
+from utils.Utils import *
+from dataSource.pool import ConnectorPool
+import re
+from tablestore import *
+import traceback
+
+
+data_path = "../data/"
+
def getCompanys():
    """Query Elasticsearch for companies whose names contain any of the
    environment-related keywords and return them as a list of dicts.

    NOTE(review): a second, unrelated `getCompanys` is defined later in
    this module and shadows this one at import time — rename one of them.
    The province / zhongBiaoNumber filters are kept commented out below.
    """
    list_company = []
    keywords = ["环境","生态","再生","回收","环保"]
    provinces = ["广东"]
    for _name in keywords:
        for _prov in provinces:
            # Wildcard match on the exact-name keyword field; `size` is
            # set very high to fetch everything in one request.
            data = make_elasticSearch({
                "query": {
                    "bool": {
                        "must": [
                            {
                                "wildcard": {
                                    "name.keyword": "*%s*"%_name
                                }
                            }
                            # ,
                            # {
                            #     "term": {
                            #         "province.keyword": "%s"%_prov
                            #     }
                            # }
                            # ,
                            # {
                            #     "range": {
                            #         "zhongBiaoNumber": {
                            #             "gt": "0"
                            #         }
                            #     }
                            # }
                        ],
                        "must_not": [ ],
                        "should": [ ]
                    }
                },
                "from": 0,
                "size": 1000000,
                "sort": [ ],
                "aggs": { }
            })
            print("--",data["hits"]["total"])
            for item in data["hits"]["hits"]:
                _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
                _company["enterprise_name"] = item["_source"].get("name","")
                _company["regCapital"] = item["_source"].get("regCapital","")
                _company["zhongBiaoNumber"] = item["_source"].get("zhongBiaoNumber","0")
                list_company.append(_company)
    # data = make_elasticSearch({
    #     "query": {
    #         "bool": {
    #             "must": [
    #                 {
    #                     "wildcard": {
    #                         "name.keyword": "*电商*"
    #                     }
    #                 }
    #                 ,
    #                 {
    #                     "term": {
    #                         "province.keyword": "北京"
    #                     }
    #                 }
    #                 ,
    #                 {
    #                     "range": {
    #                         "zhongBiaoNumber": {
    #                             "gt": "0"
    #                         }
    #                     }
    #                 }
    #             ],
    #             "must_not": [ ],
    #             "should": [ ]
    #         }
    #     },
    #     "from": 0,
    #     "size": 10000,
    #     "sort": [ ],
    #     "aggs": { }
    # })
    #
    # for item in data["hits"]["hits"]:
    #     _company = {"enterprise_name":"","regCapital":"","legal_person":"","phone":"","industry":"","province":""}
    #     _company["enterprise_name"] = item["_source"].get("name","")
    #     _company["regCapital"] = item["_source"].get("regCapital","")
    #     list_company.append(_company)
    print(len(list_company))
    return list_company
+
def exportFactory():
    """Join an Excel list of companies with MongoDB profile data and a
    neo4j win-bid count, and export the combined sheet.

    For each company: copy legal person / phone / capital fields from
    the mongo `enterprise_profile` collection, count its
    ZhongBiaoRelation edges in neo4j, and write everything to
    "<filename>_export.xlsx".  Only companies found in mongo are kept
    (only those are pushed to result_queue).
    """
    def _handle(item,result_queue,pool_mongo,pool_neo4j):
        company_name = item["enterprise_name"]
        mongo = pool_mongo.getConnector()
        coll_zb = mongo.enterprise_profile
        rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1, "actualCapital":1,"estiblishTime":1,"legal_person":1,"phone":1 })
        _flag = False
        # Take the first matching profile document, if any.
        for row in rows:
            actualCapital = row.get("actualCapital","0")
            estiblishTime = row.get("estiblishTime","2020-01-01")
            _captial = re.match("\d+[亿万]+",actualCapital)
            # if _captial is not None:
            # if getUnifyMoney(_captial.group())>getUnifyMoney("5000万"):
            # if estiblishTime<="2015-10-09":
            item["legal_person"] = row.get("legal_person","")
            item["phone"] = row.get("phone","")
            item["actualCapital"] = actualCapital
            item["estiblishTime"] = row.get("estiblishTime","")
            _flag = True
            break
        if _flag:
            result_queue.put(item)
        # Count win-bid relations.  NOTE(review): the company name is
        # interpolated into the Cypher string — names containing a quote
        # will break the query.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
        graph = pool_neo4j.getConnector()
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = data[0]["_c"]
        # list_project = []
        # for _data in data:
        #     if _count<=3:
        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
        #             if _data["project_name"] is not None:
        #                 list_project.append(_data["project_name"])
        #     _count += 1
        item["count"] = _count
        pool_mongo.putConnector(mongo)
        pool_neo4j.putConnector(graph)
    # list_company = getCompanys()
    list_company = []
    filename = "../data/天眼查1(1).xlsx"
    df1 = pd.read_excel(filename)
    for item in df1["公司名称"]:
        list_company.append({"enterprise_name":item,"regCapital":"","legal_person":"","phone":"","industry":"","province":""})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    pool_mongo = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_mongodb)
    pool_neo4j = ConnectorPool(init_num=10,max_num=50,method_init=getConnect_neo4j)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j)
    _mult.run()
    # Drain results into parallel column lists for the DataFrame.
    list_name = []
    list_actualCapital = []
    list_estiblishTime = []
    list_legal_person = []
    list_phone = []
    list_zb = []
    while(True):
        try:
            item = result_queue.get(False)
            list_name.append(item["enterprise_name"])
            list_actualCapital.append(item["actualCapital"])
            list_estiblishTime.append(item["estiblishTime"])
            list_legal_person.append(item["legal_person"])
            list_phone.append(item["phone"])
            list_zb.append(item["count"])
        except:
            # queue.Empty terminates the drain loop.
            break
    df = pd.DataFrame({"公司":list_name,"实缴":list_actualCapital,
                       "注册时间":list_estiblishTime,"联系人":list_legal_person,"联系电话":list_phone,
                       "中标次数":list_zb})
    df.to_excel("%s"%filename+"_export.xlsx",columns=["公司","实缴","注册时间","联系人","联系电话","中标次数"])
+
def deal():
    """Re-export the "北京行业" sheet with each company's three most
    recent post-2019 win-bid project names pulled from neo4j.
    """
    def _handle(item,result_queue):
        # One neo4j connection per task (no pooling in this variant).
        graph = getConnect_neo4j()
        company_name = item["enterprise_name"]
        # NOTE(review): the name is string-interpolated into Cypher —
        # breaks on names containing a single quote.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc limit 3"%(company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = 1
        list_project = []
        # Keep at most the first 3 projects newer than 2019-01-01.
        for _data in data:
            if _count<=3:
                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
                    list_project.append(_data["project_name"])
            _count += 1
        item["project"] = str(list_project)
        result_queue.put(item)
    file = "../data/北京行业_export.xls"
    df = pd.read_excel(file)
    list_company = []
    for _company,rep,industry,project,count,person,phone in zip(df["公司名字"],df["注册资金"],df["行业"],df["中标项目"],df["中标次数"],df["联系人"],df["联系电话"]):
        list_company.append({"enterprise_name":_company,"regCapital":rep,"legal_person":person,"phone":phone,"industry":industry,"province":"","count":count})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    _mult.run()
    # Drain results into parallel column lists (order is arbitrary).
    list_name = []
    list_regCapital = []
    list_industry = []
    list_count = []
    list_person = []
    list_phone = []
    list_project = []
    while(True):

        try:
            _result = result_queue.get(False)
            list_name.append(_result["enterprise_name"])
            list_regCapital.append(_result["regCapital"])
            list_industry.append(_result["industry"])
            list_count.append(_result["count"])
            list_person.append(_result["legal_person"])
            list_phone.append(_result["phone"])
            list_project.append(_result["project"])
        except Exception as e:
            # queue.Empty ends the drain loop.
            print(e)
            break
    df1 = pd.DataFrame({"公司名字":list_name,"注册资金":list_regCapital,"行业":list_industry,"中标项目":list_project,"中标次数":list_count,"联系人":list_person,"联系电话":list_phone})
    df1.to_excel("%s_export1.xls"%("北京行业"),columns=["公司名字","注册资金","行业","中标项目","中标次数","联系人","联系电话"])
+
def deal1():
    """For each company in the input sheet, compute its total win-bid
    count, up to three recent (post-2019) project names, and the summed
    win amount from neo4j, then export name + count columns.
    """
    def _handle(item,result_queue):
        graph = getConnect_neo4j()
        company_name = item["enterprise_name"]
        # NOTE(review): string-interpolated Cypher — breaks on names
        # containing a single quote.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhong_biao_page_time as zhong_biao_page_time,p.project_name as project_name order by p.zhong_biao_page_time desc "%(company_name)
        finded = graph.run(cql)
        data = json.loads(json.dumps(finded.data()))
        _count = 0
        list_project = []
        # First 3 rows newer than 2019-01-01; _count ends as total rows.
        for _data in data:
            if _count<=2:
                if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
                    list_project.append(_data["project_name"])
            _count += 1
        item["count"] = _count
        item["project"] = str(list_project)
        # Second query: sum the win-bid prices over all relations.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company_name)
        print(cql)
        finded = graph.run(cql)
        finded_money = json.loads(json.dumps(finded.data()))
        whole_money = 0
        for _item in finded_money:
            if _item["r.price"] is not None:
                whole_money += getUnifyMoney(_item["r.price"])
        item["whole_money"] = str(whole_money)
        result_queue.put(item)
    # filename = "数据导出需求9.11(1)(1).xlsx"
    filename = "../data/新建 XLSX 工作表(1).xlsx"
    df = pd.read_excel(filename)
    list_company = []
    for _key in df.keys():
        print(_key,len(df[_key]))
    for _company in df["公司名称"]:
        list_company.append({"enterprise_name":_company,"regCapital":"","legal_person":"","phone":"","industry":"","province":"","count":0})
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    _mult = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
    _mult.run()
    # Index results by company name so output follows the sheet order.
    _dict_item = {}
    while(True):
        try:
            item = result_queue.get(False)
            if item["enterprise_name"]!="":
                _dict_item[item["enterprise_name"]] = item
        except Exception as e:
            # queue.Empty ends the drain loop.
            print(str(e))
            break
    list_count = []
    list_project = []
    list_money = []
    list_zb = []
    for _company in df["公司名称"]:
        if _company in _dict_item:
            list_count.append(_dict_item[_company]["count"])
            list_project.append(_dict_item[_company]["project"])
            list_money.append(_dict_item[_company]["whole_money"])
            list_zb.append("是" if _dict_item[_company]["count"]>0 else "否")
        else:
            print(_company)
            list_count.append(0)
            list_project.append("")
            list_money.append("0")
            list_zb.append("否")

    print(len(list_count),len(list_project),len(list_money),len(list_zb))
    df2 = pd.DataFrame({"公司名称":df["公司名称"],"次数":list_count})
    df2.to_excel("%s_export.xls"%filename)
    # df1 = pd.DataFrame({"月份":df["月份"],"电话":df["电话"],"公司名字":df["公司名字"],"开通时间":df["开通时间"],
    #                     "到期时间":df["到期时间"],"客户公司注册时间":df["客户公司注册时间"],"客户公司注册资金":df["客户公司注册资金"],
    #                     "实际缴费资金":df["实际缴费资金"],"天眼查行业分类":df["天眼查行业分类"],"是否中标":list_zb,
    #                     "中标次数":list_count,"中标项目|3个":list_project,"中标金额":list_money,"客户设置关键词":df["客户设置关键词"],"客户搜索词":df["客户搜索词"].xls})
    # df1.to_excel("%s_补充.xls"%filename,columns=["月份","电话","公司名字",	"开通时间"	,"到期时间"	,"客户公司注册时间"	,"客户公司注册资金"	,"实际缴费资金"	,"天眼查行业分类"	,"是否中标"	,"中标次数"	,"中标项目|3个"	,"中标金额"	,"客户设置关键词"	,"客户搜索词"])
+
def deal3():
    """Count companies in the exported factory sheet whose paid-in
    capital ("实缴") exceeds 5000万, printing a running count.
    """
    filename = "../data/导出工厂.xlsx"
    # Bug fix: the original called pd.DataFrame(filename), which wraps
    # the filename string in a frame with no "实缴" column and raises a
    # KeyError — the file must actually be read.
    df = pd.read_excel(filename)
    count = 0
    # Hoist the constant threshold out of the loop.
    threshold = getUnifyMoney("5000万")
    for item in df["实缴"]:
        if getUnifyMoney(item)>threshold:
            count += 1
            print(count)
+
def exportEnterpriseByName():
    """Enrich a CSV of (name, phone) pairs with enterprise profile
    fields read from the OTS `enterprise` table, matching the contact
    person by the phone number, and write 中标家具公司1.csv.
    """
    df = pd.read_csv("../data/中标家具公司.csv",encoding="GBK")

    def _handle(item,result_queue,pool_ots):
        ots_client = pool_ots.getConnector()
        try:
            primary_key = [('name',str(item["name"]))]

            columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]

            consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)

            print(return_row)

            # get_row returns None when the key does not exist; the
            # original code crashed on .attribute_columns in that case.
            if return_row is None:
                return

            for _item in return_row.attribute_columns:
                if _item[0]=="contacts":
                    # Pick the contact whose mobile/phone matches the
                    # number we already have for this company.
                    a = json.loads(_item[1])
                    for i in a:
                        if i.get("mobile_no","")==item["phone"] or i.get("phone_no","")==item["phone"]:
                            item["contact_person"] = i.get("contact_person","")
                else:
                    item[_item[0]] = _item[1]
        finally:
            # Bug fix: the connector was never returned, so with 70
            # threads and max_num=30 the pool eventually starved.
            pool_ots.putConnector(ots_client)

    list_dict = []
    for name,phone in zip(df["name"],df["phone"]):
        list_dict.append({"name":name,"phone":phone})

    task_queue = queue.Queue()
    for item in list_dict:
        task_queue.put(item)

    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=70,pool_ots=pool_ots)
    mt.run()

    # Items were mutated in place; flatten them into column lists.
    columns = ["name","contact_person","phone","reg_capital","actual_capital","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for item in list_dict:
        for _key in columns:
            df_data[_key].append(item.get(_key,""))
    df1 = pd.DataFrame(df_data)
    df1.to_csv("中标家具公司1.csv")
+
def getCompanys():
    """Fetch active Guangzhou-branch member accounts from the business
    MySQL database as a list of dicts keyed by the SQL column aliases.

    NOTE(review): this redefines (and, being later in the module,
    shadows) the Elasticsearch-based `getCompanys` above — rename one.
    """
    conn = getConnection_mysql()
    cursor = conn.cursor()
    # NOTE(review): "A.etiem" looks like a typo for "etime" but
    # presumably matches the actual column name — verify against schema.
    sql = '''select C.login_id as 登陆名,B.company ,B.contactname as 联系人,B.phone as 联系电话 ,(select MLEVELNAME from sys_memberlevel where id =A.memberlevelid) as 会员等级,( select name from b2c_mall_staff_basic_info where userid=B.aftermarket) as 售后客服   from bxkc.bxkc_member_term A,bxkc.b2c_mall_staff_basic_info B,bxkc.b2c_user_login_info C
where A.USERID=B.USERID and B.USERID=C.USERID and B.innerOrg like '广州%'
and A.memberlevelid!=81 and A.status='01' and str_to_date('2020-11-20','%Y-%m-%d') between  A.stime and A.etiem ;
'''
    cursor.execute(sql)
    # cursor.description yields the column aliases used as dict keys.
    vol = cursor.description
    list_company = []
    rows = cursor.fetchall()
    for row in rows:
        _company = {}
        for _vol,_value in zip(vol,row):
            _name = _vol[0]
            _company[_name] = _value
        list_company.append(_company)
    return list_company
+
def exportEnterprise_byindustry(page_time,
                                columns = ["name","address","business_scope","province","city","district","reg_capital","phone","estiblish_time"],
                                keywords = ["钢材","水泥","五金","水电","暖通","暖气","电缆"]):
    """Export enterprises whose industry or nicknames match any keyword
    and that were established before 2017, paging the full OTS index
    into ../data/enterprise_2017_a.csv.

    NOTE(review): the `page_time` parameter is accepted but never used.
    """

    # OR across wildcard matches on both industry and nicknames.
    list_should_q = []
    for _key in keywords:
        list_should_q.append(WildcardQuery("industry","*%s*"%_key))
        list_should_q.append(WildcardQuery("nicknames","*%s*"%_key))
    key_query = BoolQuery(should_queries=list_should_q)

    #WildcardQuery("industry","*建筑*")
    ots_client = getConnect_ots()
    # bidi_id >= 0 acts as an index-coverage filter alongside the
    # keyword OR-group and the establish-time cutoff.
    bool_query = BoolQuery(must_queries=[RangeQuery("bidi_id",0,include_lower=True),
                                         key_query,
                                         RangeQuery("estiblish_time",range_to="2017-01-01")])

    rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                      SearchQuery(bool_query, limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    all_rows = 0
    df_data = {}
    for key in columns:
        df_data[key] = []
    # Flatten the first page: each row is (primary key parts, attrs).
    for row in rows:
        _dict = dict()
        for part in row:
            for item in part:
                _dict[item[0]] = item[1]
        for key in columns:
            df_data[key].append(_dict.get(key,""))
        # if "reg_capital" in _dict:
        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
        #     if _money is not None:
        #         if getUnifyMoney(_money.group())>2000000:
        #             for key in columns:
        #                 df_data[key].append(_dict.get(key,""))
    all_rows += len(rows)

    # print(next_token)
    # Continue paging until OTS stops returning a continuation token.
    while(next_token):
        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                          SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for part in row:
                for item in part:
                    _dict[item[0]] = item[1]
            for key in columns:
                df_data[key].append(_dict.get(key,""))
        # if "reg_capital" in _dict:
        #     _money = re.match("\d+[万亿千百十]",_dict["reg_capital"])
        #     if _money is not None:
        #         if getUnifyMoney(_money.group())>2000000:
        #             for key in columns:
        #                 df_data[key].append(_dict.get(key,""))
        all_rows += len(rows)
        print(all_rows,total_count,len(df_data[columns[0]]))
    df = pd.DataFrame(df_data)
    df.to_csv("../data/enterprise_2017_a.csv",columns=columns)
+
+
def getTyc_company():
    """Bulk-load Tianyancha export spreadsheets into the test MySQL
    `Enterprise` table (company name plus registration identifiers).
    """
    root_path = ["G:/文档/tyc国企","G:/文档/tyc机构"]
    list_files = []
    for _path in root_path:
        for file in os.listdir(_path):
            list_files.append(os.path.join(_path,file))

    # NOTE(review): this override restricts the run to a single file and
    # looks like a leftover debugging shortcut — confirm before removal.
    list_files = ["G:/文档/tyc机构\\高级搜索导出数据结果—自定义条件—天眼查(W20011656561610789770227).xlsx"]

    pool_mysql = ConnectorPool(method_init=getConnection_testmysql,init_num=10,max_num=30)
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _file in list_files:
        task_queue.put(_file)

    def _handle(_file,result_queue,pool_mysql):
        print("handle",_file)
        conn = pool_mysql.getConnector()
        cursor = conn.cursor()
        # header=2: the Tianyancha export has banner rows above the real
        # column names.
        df = pd.read_excel(_file,header=2)
        # Parameterized insert: company names routinely contain quotes,
        # which broke the original %-formatted SQL.
        insert_sql = (" insert into Enterprise(name,social_credit,identification,"
                      "regist_num,organization_code) values (%s,%s,%s,%s,%s)")
        for name,social_credit,identification,regist_num,organization_code in zip(df["公司名称"],df["统一社会信用代码"],df["纳税人识别号"],df["注册号"],df["组织机构代码"]):
            try:
                cursor.execute(insert_sql,(name,social_credit,identification,regist_num,organization_code))
            except Exception as e:
                # Log the cause (e.g. duplicate key) instead of a bare
                # "error" that hid every failure reason.
                print("error",str(e))
        conn.commit()
        pool_mysql.putConnector(conn)

    mt = MultiThreadHandler(task_queue,_handle,result_queue,20,pool_mysql=pool_mysql)
    mt.run()
+
def exportEnterprise_by_bidNum():
    """Export names of enterprises with a Tianyancha id and at least 4
    bids, paging the OTS enterprise index into enterprise_bidinum.csv.
    """

    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[RangeQuery("tyc_id",1,include_lower=True),
                                           RangeQuery("bid_number",4,include_lower=True)
                                           ])

    columns = ["name"]
    rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("tyc_id",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    df_data = {}
    for _key in columns:
        df_data[_key] = []

    def getData(df_data,rows):
        # Append one page of rows; names of 4 chars or fewer are dropped
        # (filters out abbreviations/noise).
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            for _key in columns:
                _v = _dict.get(_key,"")
                if len(_v)>4:
                    df_data[_key].append(_v)
    getData(df_data,rows)
    _count = len(rows)
    # Page until OTS stops returning a continuation token.
    while(next_token):
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                          SearchQuery(bool_query, next_token=next_token,limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        getData(df_data,rows)
        _count += len(rows)

    df = pd.DataFrame(df_data)
    df.to_csv("../data/enterprise_bidinum.csv",columns=columns)
+
def make_Legal_enterprise():
    """Build ../data/LEGAL_ENTERPRISE.txt — deduplicated, cleaned company names.

    Merges names from ../data/enterprise_bidinum.csv and the MySQL
    `Enterprise` table, dropping entries that fail the `format` filter.
    """
    import codecs
    def format(_e):
        # Clean one raw name; return the cleaned string, or None to discard it.
        if _e is None:
            return None
        if not isinstance(_e,str):
            return None
        # pure alphanumeric strings are codes, not company names
        if re.search("^[a-zA-Z0-9]+$",_e) is not None:
            return None
        # bugfix: this pattern was mistyped as "[<《]>-。\-\.\?]" — the class
        # closed right after 《, so it only matched the literal text ">-。-.?]"
        # and the punctuation filter effectively never fired. Reject any name
        # containing one of these marks instead.
        if re.search("[<《>》\-。\.\?]",_e) is not None:
            return None
        # normalize full-width parentheses and strip all whitespace
        _e1 = re.sub("\s+","",_e.replace("(","(").replace(")",")"))
        # bugfix: check the cleaned name (was the raw `_e`, which could carry
        # trailing whitespace and dodge the suffix check)
        if re.search("[省市区县乡镇]$",_e1) is not None:
            return None
        if len(_e1)>=4:
            return _e1
        return None
    set_enterprise = set()
    df = pd.read_csv("../data/enterprise_bidinum.csv",encoding="GBK")

    _count = 0
    for _e in df["name"]:
        _count += 1
        if _count%10000==0:
            print(_count)
        _e1 = format(_e)
        if _e1 is not None:
            set_enterprise.add(_e1)

    # Second source: the MySQL Enterprise table, streamed in 10k batches.
    conn = getConnection_testmysql()
    cursor = conn.cursor()
    sql = " select name from Enterprise "
    cursor.execute(sql)
    rows = cursor.fetchmany(10000)
    while rows:
        for row in rows:
            _count += 1
            if _count%10000==0:
                print(_count)
            _e = row[0]
            _e1 = format(_e)
            if _e1 is not None:
                set_enterprise.add(_e1)
        rows = cursor.fetchmany(10000)

    with codecs.open("../data/LEGAL_ENTERPRISE.txt","w",encoding="UTF8") as f:
        for _e in list(set_enterprise):
            f.write(_e+"\n")
+
+
def getDictEnterprise(list_enterprise,columns_to_get = ["reg_capital","actual_capital","contacts","industry","estiblish_time","social_staff_num","business_scope","zhong_biao_number"]):
    """Fetch selected OTS columns for every enterprise name given.

    :param list_enterprise: iterable of enterprise names (OTS primary key "name").
        NOTE(review): columns_to_get's mutable default list is only read, never
        mutated, so it is safe — but prefer passing an explicit list.
    :param columns_to_get: attribute columns to read for each row.
    :return: dict mapping name -> row dict (as built by getRow_ots_primary).
    """
    task_queue = queue.Queue()
    result_queue= queue.Queue()

    for _enterprise in list_enterprise:
        task_queue.put(_enterprise)
    def _handle(item,result_queue,pool_ots):
        # Look up one enterprise by primary key and queue {name: row_dict}.
        ots_client = pool_ots.getConnector()
        primary_key = [("name",item)]
        consumed,return_row,next_token = ots_client.get_row("enterprise",primary_key,columns_to_get,None,1)
        dict_data = getRow_ots_primary(return_row)
        # NOTE(review): getRow_ots_primary returns {} (never None) for a
        # missing row, so this guard always passes — confirm intent.
        if dict_data is not None:
            result_queue.put({item:dict_data})

        pool_ots.putConnector(ots_client)

    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
    mt.run()

    # Drain the result queue without blocking; the Empty exception ends the loop.
    dict_enterprise = {}
    while True:
        try:
            _dict = result_queue.get(False)
            for k,v in _dict.items():
                dict_enterprise[k] = v
        except Exception as e:
            break
    return dict_enterprise
+
+
+
def getOneContact(contacts,tojson=True,mobile_first=True):
    """Pick one (person, number) pair from a contact list.

    Keeps the LAST entry carrying a mobile number and the last carrying a
    landline number, then returns one of the two pairs.

    :param contacts: JSON string (when tojson=True) or an already-parsed list
        of dicts with keys "contact_person", "mobile_no", "phone_no".
    :param tojson: parse `contacts` with json.loads first.
    :param mobile_first: return the mobile pair; otherwise the landline pair.
    :return: (person, number) tuple; empty strings when nothing matched.
    """
    if tojson:
        list_contacts = json.loads(contacts)
    else:
        list_contacts = contacts
    mobile_person = ""
    mobile_no = ''
    phone_person = ""
    phone_no = ''
    for _contact in list_contacts:
        if _contact.get("mobile_no","")!="":
            mobile_person = _contact.get("contact_person","")
            mobile_no = _contact.get("mobile_no","")
        if _contact.get("phone_no","")!="":
            # bugfix: was _contact.get("phone_no","") — the person slot was
            # being filled with the phone NUMBER instead of the contact name
            phone_person = _contact.get("contact_person","")
            phone_no = _contact.get("phone_no","")
    if mobile_first:
        return mobile_person,mobile_no
    return phone_person,phone_no
+
+
+
+
if __name__=="__main__":
    # getTyc_company()
    # Export enterprise names with >=4 bid records, then build the cleaned
    # LEGAL_ENTERPRISE list from that csv plus the MySQL table.
    exportEnterprise_by_bidNum()
    make_Legal_enterprise()

+ 59 - 0
export/exportUnion.py

@@ -0,0 +1,59 @@
+
+
def export_fromGDB():
    """Post-process the tenderee export: emit one deduplicated tenderee list per name.

    NOTE: everything after the first `return` is the earlier GDB-crawling
    stage kept as dead code; it never executes.
    """
    df1 = pd.read_csv("../data/exportFind_tenderee.csv",encoding="GBK")
    l_t = []
    for name,tenderee in zip(df1["name"],df1["tenderee"]):
        # `tenderee` holds a stringified python list like "['a', 'b']";
        # strip the surrounding brackets/quotes and deduplicate.
        # NOTE(review): pandas yields NaN (a float) for empty cells, which
        # this None/"" check does not catch — confirm the csv has no blanks.
        if tenderee is not None and tenderee!="":
            list_ten = list(set(tenderee[2:-2].split("', '")))
            l_t.append(list_ten)
        else:
            l_t.append([])
    df1["tenderer_count"] = l_t
    df1.to_csv("../data/exportFind_tenderee2.csv",columns=["name","tenderer_count"])
    return

    # ---- dead code below: the original GDB crawl that produced the csv ----
    import gremlin_python
    df = pd.read_excel("../data/findTenderee.xlsx")
    set_win_tenderer = set()
    # NOTE(review): "µ¥Î»Ãû³Æ" looks like mojibake of the GBK header 单位名称
    # (company name) read with the wrong encoding — verify before reuse.
    for item in df["µ¥Î»Ãû³Æ"]:
        if item is not None and item !="":
            set_win_tenderer.add(item)

    df1 = pd.read_csv("../data/exportFind_tenderee.csv",encoding="GBK")

    # Only query names not already present in the previous export.
    task_queue = queue.Queue()
    for _name in list(set_win_tenderer-set(df1["name"])):
        task_queue.put(_name)
    result_queue = queue.Queue()

    def _handle(_name,result_queue,pool_gdb):
        # Win-bid vertex -> its projects -> the tenderees that announced them.
        client = pool_gdb.getConnector()
        callback = client.submitAsync("g.V('%s').outE('ZhongBiaoRelation').inV().inE('ZhaoBiaoRelation').outV()"%(_name))
        list_tenderee = []
        for result in callback.result():
            for item in result:
                list_tenderee.append(item.id)
        result_queue.put({"name":_name,"tenderee":list_tenderee})
        pool_gdb.putConnector(client)

    pool_gdb = ConnectorPool(init_num=30,max_num=50,method_init=getConnect_gdb)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=50,pool_gdb=pool_gdb)
    mt.run()

    # Drain results into a frame; a 1s empty timeout ends the loop.
    df_data = {"name":[],"tenderee":[]}
    while(True):
        try:
            item = result_queue.get(block=True,timeout=1)
            for _k in df_data.keys():
                if _k in item:
                    df_data[_k].append(str(item[_k]))
                else:
                    df_data[_k].append("")
        except queue.Empty as e:
            break
        except Exception as e:
            traceback.print_exc()

    df = pd.DataFrame(df_data)
    df.to_csv("../data/exportFind_tenderee1.csv",columns=["name","tenderee"])

+ 120 - 0
export/transformContact.py

@@ -0,0 +1,120 @@
+
+
+import os
+os.environ['NLS_LANG'] = 'AMERICAN_AMERICA.AL32UTF8'
+from dataSource.source import *
+from utils.Utils import *
+
+
def getContactjson(companyname,contact_person,mobile_no,phone_no,mail):
    """Serialize one contact record as a JSON array string.

    Returns "[]" when neither a mobile nor a landline number is present.
    Single quotes are stripped from the JSON text because the result is later
    embedded inside single-quoted SQL literals.
    """
    if mobile_no == "" and phone_no == "":
        return json.dumps([])
    record = {
        "company": companyname,
        "contact_person": contact_person,
        "mobile_no": mobile_no,
        "phone_no": phone_no,
        "mail": mail,
        "level": 40,
    }
    return json.dumps([record], ensure_ascii=False).replace("'", "")
+
# Pre-compiled matcher for PRC mobile numbers: exactly 11 digits, leading 1.
mobile_pattern = re.compile("^1\d{10}$")
def recog_likeType(phone):
    """Classify a number string: "mobile" (11 digits starting with 1) or "phone"."""
    return "mobile" if mobile_pattern.search(phone) is not None else "phone"
+
def transform_contact():
    """Copy contacts from Oracle source tables into MySQL as JSON rows.

    Streams bxkc.bxkc_enterprise_listing_info in batches, converts each row
    into a one-element contact-JSON string and inserts it into
    bxkc.company_contact_json. A running row count is printed as progress.
    (The earlier HUI_CONG / gongyingshang migrations that lived here as large
    commented-out blocks have been removed; see VCS history to restore them.)
    """
    conn_target = getConnection_testmysql()
    conn_source = getConnection_oracle()

    cursor_target = conn_target.cursor()
    cursor_source = conn_source.cursor()

    sql = "select company_name,contact_person,mobile_no from bxkc.bxkc_enterprise_listing_info"
    cursor_source.execute(sql)

    # First batch is deliberately tiny (quick sanity check), then 50000 a time.
    rows = cursor_source.fetchmany(10)
    _count = len(rows)
    while rows:
        list_json = []
        for row in rows:
            company = row[0]
            contact_person = row[1]
            phone_no = ""
            mobile_no = ""
            company_contact_num = row[2]
            if company_contact_num is not None:
                # Route the single number column into mobile vs landline.
                if recog_likeType(company_contact_num)=="mobile":
                    mobile_no = company_contact_num
                else:
                    phone_no = company_contact_num
            mail = ''
            if phone_no!="" or mobile_no!="":
                list_json.append(getContactjson(company,contact_person,mobile_no,phone_no,mail))
        # bugfix: parameterized executemany instead of splicing the JSON text
        # into the SQL string (quote-unsafe), and skip empty batches — the old
        # concatenation inserted a bogus '' row whenever list_json was empty.
        if list_json:
            cursor_target.executemany(
                "insert into bxkc.company_contact_json(json_contact) values (%s)",
                [(j,) for j in list_json])
        rows = cursor_source.fetchmany(50000)
        _count += len(rows)
        print(_count)

    conn_target.commit()
+
if __name__=="__main__":
    # Run the Oracle -> MySQL contact migration when executed directly.
    transform_contact()

+ 0 - 0
jobs/__init__.py


BIN
jobs/data/2021-04-30_135826_周五医疗数据导出.xlsx


BIN
jobs/data/2021-0年2021-05-02至2021-05-08医疗数据导出.xlsx


+ 38 - 0
jobs/exportJobs.py

@@ -0,0 +1,38 @@
+#coding:utf8
+
+from utils.Utils import sendEmail,getCurrent_date
+import os
+import datetime
+import time
+from export.exportDocument import exportDocument_medicine,list_df_columns
+import pandas as pd
+
+from apscheduler.schedulers.blocking import BlockingScheduler
+
def export_medicine_friday():
    """Weekly job: export the last week's medicine documents and mail the xlsx.

    Runs only on Fridays and only until 2022-04-25; the window spans the six
    days before today through today.
    """
    current_date = getCurrent_date("%Y-%m-%d")
    # six days back from now, formatted like current_date
    start_time = time.strftime("%Y-%m-%d",time.localtime(time.mktime(time.localtime())-6*24*60*60))
    if current_date<="2022-04-25":
        if datetime.datetime.now().weekday()==4:
            df_data = exportDocument_medicine(start_time,current_date)
            df = pd.DataFrame(df_data)

            # bugfix: the year prefix used start_time[:-4] which yields
            # "2021-0" (visible in the committed file names); the first four
            # characters are the year.
            filename = os.path.dirname(__file__)+"/data/%s年%s至%s医疗数据导出.xlsx"%(start_time[:4],start_time,current_date)
            df.to_excel(filename,columns=['公告标题', '公告类别', '省份', '城市', '发布时间', '项目编号', '招标单位', '招标联系人', '招标联系人电话', '代理单位', '代理联系人', '代理联系人电话', '比地招标公告地址', '中标单位', '中标金额', '招标金额', '中标单位联系人', '中标单位联系电话'])
            host = "smtp.exmail.qq.com"
            username = "vip@bidizhaobiao.com"
            # NOTE(review): credentials are hard-coded in source control —
            # move them to configuration/secret management.
            password = "Biaoxun66-"
            receivers = ["1985262186@qq.com","1175730271@qq.com","1265797328@qq.com","1289358902@qq.com"]
            attachs = [filename]
            sendEmail(host,username,password,receivers,attachs=attachs)
+
+
def job_medicine_friday():
    """Schedule export_medicine_friday every Friday at 18:00 (blocking call)."""
    _scheduler = BlockingScheduler()
    # bugfix: register the callable itself — the old code invoked
    # export_medicine_friday() immediately and scheduled its None return
    # value, so apscheduler had nothing runnable.
    _scheduler.add_job(export_medicine_friday,"cron",day_of_week='fri',hour=18)
    _scheduler.start()
+
+
+
if __name__=="__main__":
    # Start the blocking weekly scheduler.
    job_medicine_friday()

+ 0 - 0
mining/__init__.py


+ 0 - 0
test/__init__.py


BIN
test/ab.swf


Những thay đổi đã bị hủy bỏ vì nó quá lớn
+ 1 - 0
test/svg


+ 104 - 0
test/t.py

@@ -0,0 +1,104 @@
+import requests
+import re
+import base64
+from urllib import parse
+headers = {
+    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+                  ' (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
+}
+
+
def get_script_data():
    """Fetch the anti-bot bootstrap page and return its arg1 challenge string.

    NOTE(review): assumes the page always embeds "arg1='...'"; .group(1)
    raises AttributeError when the pattern is missing.
    """
    response = requests.get('https://bulletin.cebpubservice.com/biddingBulletin/2020-09-04/3532001.html', headers=headers)
    print(response.text)
    arg1 = re.search("arg1='([^']+)'", response.text).group(1)
    # _0x4818 = re.search('_0x4818=(\[.*?\])', response.text).group(1)
    # # when converting to json, single quotes must be replaced with double quotes,
    # # otherwise json.decoder.JSONDecodeError: Expecting value: line 1 column 2 (char 1)
    # _0x4818 = json.loads(_0x4818.encode('latin-1').decode('unicode_escape').replace("'", '"'))
    return arg1
+
+
+# def rc4():
+#     # base64解码,记得解码
+#     _0x401af1 = base64.b64decode("wqhBH8Knw4TDhSDDgMOdwrjCncOWwphhN8KCGcKqw6dHAU5+wrg2JcKaw4IEJcOcwrRJwoZ0wqF9YgAV").decode()
+#     _0x532ac0 = "jS1Y"
+#     _0x45079a = [''] * 0x100
+#     _0x52d57c = 0x0
+#     _0x3fd789 = ''
+#     _0x4a2aed = '%c2'
+#     _0x124d17 = 0
+#     _0x1b9115 = len(_0x401af1)
+#     while(_0x124d17 < _0x1b9115):
+#         _0x4a2aed += '\x25' + hex(ord(_0x401af1[_0x124d17]))[2:]
+#         _0x124d17 += 1
+#     _0x401af1 = parse.unquote(_0x4a2aed)
+#     for _0x2d67ec in range(0, 0x100):
+#         _0x45079a[_0x2d67ec] = _0x2d67ec
+#
+#     for _0x2d67ec in range(0, 0x100):
+#         _0x52d57c = (_0x52d57c + _0x45079a[_0x2d67ec] + ord(_0x532ac0[_0x2d67ec % len(_0x532ac0)])) % 0x100
+#         _0x105f59 = _0x45079a[_0x2d67ec]
+#         _0x45079a[_0x2d67ec] = _0x45079a[_0x52d57c]
+#         _0x45079a[_0x52d57c] = _0x105f59
+#
+#     _0x2d67ec = 0x0
+#     _0x52d57c = 0x0
+#
+#     for _0x4e5ce2 in range(0, len(_0x401af1)):
+#         _0x2d67ec = (_0x2d67ec + 0x1) % 0x100
+#         _0x52d57c = (_0x52d57c + _0x45079a[_0x2d67ec]) % 0x100
+#         _0x105f59 = _0x45079a[_0x2d67ec]
+#         _0x45079a[_0x2d67ec] = _0x45079a[_0x52d57c]
+#         _0x45079a[_0x52d57c] = _0x105f59
+#         _0x3fd789 += chr(ord(_0x401af1[_0x4e5ce2]) ^ _0x45079a[(_0x45079a[_0x2d67ec] + _0x45079a[_0x52d57c]) % 0x100])
+#
+#     return _0x3fd789
+
+
def hexXor(_0x4e08d8, _0x23a392):
    """XOR two hex strings byte-by-byte; return the lowercase hex result.

    Walks both inputs two hex digits at a time, XORing the byte values, and
    stops at the end of the shorter string (port of the site's obfuscated JS).

    bugfix: each XOR result is now zero-padded to two digits. The original
    `if len(_0x189e2c) == 0x1` pad check could never fire because hex()
    always returns at least "0x0" (length 3), so bytes like 0x03 were emitted
    as "3" and misaligned every following byte. Debug prints removed.
    """
    _0x5a5d3b = ''
    _0xe89588 = 0x0
    while _0xe89588 < len(_0x23a392) and _0xe89588 < len(_0x4e08d8):
        _0x401af1 = int(_0x23a392[_0xe89588: _0xe89588 + 0x2], 16)
        _0x105f59 = int(_0x4e08d8[_0xe89588: _0xe89588 + 0x2], 16)
        # always two lowercase hex digits per byte
        _0x5a5d3b += format(_0x401af1 ^ _0x105f59, '02x')
        _0xe89588 += 0x2
    return _0x5a5d3b
+
+
def unsbox(arg):
    """Undo the acw_sc__v2 substitution-box shuffle.

    Output position i takes the character of `arg` whose 1-based index equals
    table[i] (same lookup table as the site's obfuscated JavaScript); slots
    whose source index is past the end of `arg` stay empty.
    """
    table = [0xf, 0x23, 0x1d, 0x18, 0x21, 0x10, 0x1, 0x26, 0xa, 0x9, 0x13, 0x1f, 0x28, 0x1b, 0x16, 0x17, 0x19, 0xd,
             0x6, 0xb, 0x27, 0x12, 0x14, 0x8, 0xe, 0x15, 0x20, 0x1a, 0x2, 0x1e, 0x7, 0x4, 0x11, 0x5, 0x3, 0x1c,
             0x22, 0x25, 0xc, 0x24]
    shuffled = [''] * 40
    for out_pos, src_index in enumerate(table):
        if src_index <= len(arg):
            shuffled[out_pos] = arg[src_index - 1]
    return ''.join(shuffled)
+
+
if __name__ == '__main__':
    # Fetch arg1 from the anti-bot bootstrap page, then derive acw_sc__v2.
    arg1 = get_script_data()
    key = '3000176000856006061501533003690027800375'
    print(arg1)
    _0x23a392 = unsbox(arg1)
    print("==",_0x23a392)
    arg2 = 'acw_sc__v2=' + hexXor(key, _0x23a392)
    print("=a",arg2)
    # NOTE(review): the derived arg2 is not actually used — the Cookie below
    # is a hard-coded token, so the second request ignores the computation.
    headers['Cookie'] = "5f5209e5a4e508cf886f0fa82cfe1ad5b1c7249c"
    response = requests.get('https://bulletin.cebpubservice.com/biddingBulletin/2020-09-04/3532001.html', headers=headers)
    # print(response.text)

+ 110 - 0
test/test.py

@@ -0,0 +1,110 @@
+#encoding:UTF8
+
+from Crypto.Cipher import AES
+import base64
password = 'zDXjL5mx5HeUgmsg7HyKLg==' # key (used as the raw 24-char string, NOT base64-decoded)
text = '137667617' # plaintext to encrypt
model = AES.MODE_CTR # cipher mode
# NOTE(review): pycryptodome's CTR mode requires a counter/nonce argument to
# AES.new — confirm which Crypto library this scratch code targeted.
aes = AES.new(password.encode("UTF8"),model) # build the AES object

en_text = aes.encrypt(text.encode("UTF8")) # encrypt the plaintext
print(en_text)
en_text1 = base64.b64encode(en_text) # base64-encode the returned bytes
print(en_text1)
+
+import base64
+from Crypto.Cipher import AES
+from Crypto import Random
+import pandas as pd
class AESCipher:
    """Scratch AES helper with ECB settings but leftover CBC comments.

    NOTE(review): several inconsistencies to confirm before reuse —
    BS is 32 although the original comments said 16; decrypt() passes an iv
    to AES.new in ECB mode (TypeError in pycryptodome); encrypt() carries an
    unreachable second return.
    """
    def __init__(self):
        '''
        (stale comment from an earlier CBC version: CBC needs a 16-byte key
        and a 16-byte iv — the code below actually uses ECB)
        '''
        self.key = self.check_key(base64.b64decode('zDXjL5mx5HeUgmsg7HyKLg==') )
        # block size used for padding (32 here, not AES's 16)
        self.BS = 32
        # ECB mode — no iv is actually consumed
        self.mode = AES.MODE_ECB
        # PKCS7-style pad over the utf-8 byte length, to a multiple of BS
        self.pad = lambda s: s + (self.BS - len(s.encode()) % self.BS)*chr(self.BS - len(s.encode()) % self.BS)
        # strip the padding again (pad length is encoded in the last byte)
        self.unpad = lambda s: s[:-ord(s[len(s) - 1:])]

    def check_key(self, key):
        '''
        Check that the key is 16, 24 or 32 bytes; return bytes, or print an
        error (and implicitly return None) when the length is wrong.
        '''
        try:
            if isinstance(key, bytes):
                assert len(key) in [16, 24, 32]
                return key
            elif isinstance(key, str):
                assert len(key.encode()) in [16, 24, 32]
                return key.encode()
            else:
                raise Exception('密钥必须为str或bytes,不能为%s'%type(key))
        except AssertionError:
            print('输入的长度不正确')

    def check_data(self, data):
        '''
        Normalize the plaintext to str (int and bytes inputs accepted).
        '''
        if isinstance(data, int):
            data = str(data)
        elif isinstance(data, bytes):
            data = data.decode()
        elif isinstance(data, str):
            pass
        else:
            raise Exception('加密的数据必须为str或bytes,不能为%s'%type(data))
        return data

    def text2hex(self,_str):
        # Hex-encode a str (utf-8) or bytes value, two lowercase digits per byte.
        list_hex = []
        if isinstance(_str,str):
            _str1 = _str.encode()
        elif isinstance(_str,bytes):
            _str1 = _str
        for _hex in map(hex,_str1):
            _i = int(_hex[2:],16)
            # inner loop emits the low byte(s); for bytes input it runs once
            while(True):
                a = _i&0xFF
                list_hex.append(hex(a)[2:].rjust(2,"0"))
                _i = _i >>8
                if _i==0:
                    break
        return "".join(list_hex)

    def encrypt(self, raw):
        raw = self.check_data(raw)
        raw = self.pad(raw).encode()
        # random iv (unused — ECB ignores it)
        iv = Random.new().read(AES.block_size)
        # build the cipher
        cipher = AES.new(self.key, self.mode)
        # returns hex; the base64 return below is unreachable (kept as-is)
        return self.text2hex(cipher.encrypt(raw))
        return base64.b64encode(cipher.encrypt(raw)).decode()

    def decrypt(self, enc):
        # base64-decode the ciphertext first
        enc = base64.b64decode(enc)
        # first BS bytes treated as the iv
        iv = enc[:self.BS]
        # NOTE(review): AES.new(key, MODE_ECB, iv) raises TypeError in
        # pycryptodome — this method cannot run as written; confirm intent.
        cipher = AES.new(self.key, self.mode, iv)
        # strip padding and return utf-8 text
        return self.unpad(cipher.decrypt(enc[self.BS:])).decode()
+
# Smoke test: encrypt a docid payload with the scratch AESCipher above.
text = '{"docid":137667617}'  # plaintext to encrypt
aes = AESCipher()
print(aes.encrypt(text))


# Unrelated exploratory prints kept from debugging sessions:
print("42\u4e07\u5143")

print(139106361%500+1)

print(679/37643)

+ 69 - 0
test/testAES.py

@@ -0,0 +1,69 @@
+#!/usr/bin/env python35
+# -*- coding: utf-8 -*-
+# Created by wyy on 2018/3/20 13:48.
+
+import base64
+import json
+import re
+
+from Crypto.Cipher import AES
+
+#http://tool.chacuo.net/cryptaes
#http://tool.chacuo.net/cryptaes
class AESECB:
    """AES/ECB helper with PKCS7-style padding measured in gbk bytes.

    NOTE(review): ECB leaks plaintext patterns — acceptable only for
    reproducing the target site's token scheme, not for real secrecy.
    """
    def __init__(self, key):
        self.key = key
        self.mode = AES.MODE_ECB

        self.bs = 16  # block size
        #self.PADDING = lambda s: s + (self.bs - len(s) % self.bs) * chr(self.bs - len(s) % self.bs)  (only supported ASCII input)
        # pad by gbk byte length so Chinese input pads to a full block
        self.PADDING = lambda s: s + (self.bs - len(s.encode('gbk')) % self.bs) * chr(self.bs - len(s.encode('gbk')) % self.bs)

    def encrypt(self, text):
        """Encrypt `text` (gbk-encoded, padded) and return the raw cipher bytes."""
        aes = AES.new(self.key, self.mode)  # ECB mode needs no iv
        #encrypt_aes = aes.encrypt((self.PADDING(text)).encode(encoding="utf-8"))
        encrypt_aes = aes.encrypt((self.PADDING(text)).encode(encoding="gbk"))
        # encrypted_base64 = base64.b64encode(encrypt_aes)
        #print(encrypted_base64)
        #return str(encrypted_base64, encoding='utf-8')

        return encrypt_aes


    def decrypt(self, text):
        """Decrypt a base64 ciphertext string and return the cleaned utf-8 text."""
        aes = AES.new(str.encode(self.key), self.mode)  # ECB mode needs no iv
        # bugfix: restore missing base64 '=' padding. The old formula
        # `(len(text) % 4) * '='` added 3 pad chars when 1 was needed
        # (len % 4 == 3), producing invalid base64; the correct count is
        # (-len) % 4.
        text += ((-len(text)) % 4) * '='
        encrypt_aes = base64.b64decode(text)
        data = aes.decrypt(encrypt_aes)

        # strip control characters left behind by the block padding
        try:
            result = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\n\r\t]').sub('', str(data, encoding='utf-8'))
        except Exception:
            result = '解码失败,请重试!'
        return result
+
def text2hex(_str):
    """Hex-encode a str (utf-8) or bytes value as lowercase two-digit pairs."""
    if isinstance(_str, str):
        _data = _str.encode()
    elif isinstance(_str, bytes):
        _data = _str
    # bytes.hex() yields exactly two lowercase hex digits per byte
    return _data.hex()
+
if __name__ == '__main__':
    key = base64.b64decode('zDXjL5mx5HeUgmsg7HyKLg==')  # key must be 16, 24 or 32 bytes (AES-128/192/256); this decodes to 16
    text = "137667617"  # plaintext to encrypt
    aes = AESECB(key)
    #print(aes.encrypt('wyy1221wyy1221'))
    #print(aes.decrypt('m9RKpQSCrZ6fF7RuPoyNLA=='))

    # Print the encrypted token as hex (mirrors the site's id obfuscation).
    print(text2hex(aes.encrypt(text)))

+ 14 - 0
test/testswf.py

@@ -0,0 +1,14 @@
+
+from swf.movie import SWF
+from swf.export import SVGExporter
+
# Convert ab.swf into an SVG document using pyswf's SVGExporter.
svg_exporter = SVGExporter()

# bugfix: use context managers so both file handles are closed even when
# parsing or export raises (the originals were never closed).
with open("ab.swf", 'rb') as swf_file:
    _swf = SWF(swf_file)

# export!
svg = _swf.export(svg_exporter)

# save the SVG
with open('svg', 'wb') as out_file:
    out_file.write(svg.read())

+ 124 - 0
utils/GdbDataRemover.py

@@ -0,0 +1,124 @@
+"""
+ File:   GdbDataRemover.py
+ Authors:
+   Mobing
+      2019/7/1 - initial release
+"""
+
+from __future__ import print_function
+import argparse
+
+from gremlin_python.driver import client
+from gremlin_python.driver.resultset import ResultSet
+
+
class PColors:
    """ANSI escape sequences used by PrintUtil for colored console output."""

    RED = '\033[91m'
    GREEN = '\033[92m'
    # bugfix: YELLOW was '\033[0;32m', which is the ANSI GREEN code;
    # 33 is yellow.
    YELLOW = '\033[0;33m'
    BLUE = '\033[94m'
    ENDC = '\033[0m'  # reset all attributes

    def __init__(self):
        pass
+
+
class PrintUtil:
    """Console helpers that print messages wrapped in PColors sequences."""

    def __init__(self):
        pass

    @staticmethod
    def rprint(msg):
        """Print msg in red, followed by a newline."""
        print(f"{PColors.RED}{msg}{PColors.ENDC}")

    @staticmethod
    def yprint(msg, new_line=True):
        """Print msg in PColors.YELLOW; carriage-return (line overwrite) when new_line is False."""
        terminator = "\n" if new_line else "\r"
        print(f"{PColors.YELLOW}{msg}{PColors.ENDC}", end=terminator)
+
+
class GdbDataRemover:
    """Batched remover for an Alibaba GDB (Gremlin) graph.

    Drops edges and/or vertices `limit` at a time so a single traversal never
    has to drop the whole graph at once.
    """
    def __init__(self, gdb_client, limit):
        self.gdb_client = gdb_client
        self.limit = limit

    def drop(self, label, drop_edge_only):
        # With no label: edges first, then vertices (unless edge-only).
        # With a label: only elements carrying that label.
        if label is None:
            self.__drop_all(True)
            if not drop_edge_only:
                self.__drop_all(False)
        else:
            self.__drop_by_label(label, drop_edge_only)

    def __drop_all(self, drop_edge_only):
        # Remove every edge (E) or vertex (V), batch by batch.
        marker = "E" if drop_edge_only else "V"
        cnt_dsl = "g.%s().count()" % marker
        cnt_params = {}
        drop_dsl = "g.%s().limit(limit).sideEffect(drop()).count()" % marker
        drop_params = {
            "limit": self.limit,
        }
        print_marker = "edges" if drop_edge_only else "vertices"
        PrintUtil.rprint("Start to remove all %s: " % print_marker)
        self.__generic_batch_drop(cnt_dsl, cnt_params,
                                  drop_dsl, drop_params)

    def __drop_by_label(self, label, drop_edge_only):
        # Same as __drop_all but restricted to one label.
        marker = "E" if drop_edge_only else "V"
        label_cnt_dsl = "g.%s().hasLabel(drop_label).count()" % marker
        label_cnt_params = {
            "drop_label": label,
        }
        label_drop_dsl = "g.%s().hasLabel(drop_label).limit(limit).sideEffect(drop()).count()" % marker
        label_drop_params = {
            "drop_label": label,
            "limit": self.limit,
        }

        print_marker = "edges" if drop_edge_only else "vertices"
        PrintUtil.rprint("Start to remove all %s with label %s: " % (print_marker, label))
        self.__generic_batch_drop(label_cnt_dsl, label_cnt_params,
                                  label_drop_dsl, label_drop_params)

    def __generic_batch_drop(self, cnt_dsl, cnt_params, drop_dsl, drop_params):
        # Count first, then repeatedly drop `limit` elements until done.
        # Returns the total number of elements dropped.
        cnt_result = self.gdb_client.submit(cnt_dsl, cnt_params)
        cnt = cnt_result.one()[0]

        if 0 == cnt:
            PrintUtil.rprint("total cnt: %d, no need to drop" % cnt)
            return 0
        else:
            PrintUtil.rprint("total cnt: %d, begin to drop" % cnt)

        total_dropped_cnt = 0
        while cnt > total_dropped_cnt:
            curr_drop_result = self.gdb_client.submit(drop_dsl, drop_params)  # type: ResultSet
            curr_dropped_cnt = curr_drop_result.one()[0]
            total_dropped_cnt += curr_dropped_cnt
            # progress on one overwritten console line
            PrintUtil.yprint("%d" % total_dropped_cnt, False)
            # NOTE(review): `self.limit < curr_dropped_cnt` looks inverted —
            # a batch can never drop MORE than `limit`, so presumably
            # `curr_dropped_cnt < self.limit` (short batch = done) was meant.
            # Confirm against the upstream Alibaba script before changing.
            if 0 == curr_dropped_cnt or self.limit < curr_dropped_cnt:
                break
        PrintUtil.yprint("")

        return total_dropped_cnt
+
+
def main():
    """CLI entry point: parse connection/limit options and drop GDB data."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', dest="host", type=str, required=True)
    parser.add_argument('--port', dest="port", type=int, default=8182)
    parser.add_argument('--username', dest="username", type=str, required=True)
    parser.add_argument('--password', dest="password", type=str, required=True)
    parser.add_argument('--limit', dest="limit", type=int, default=500)
    parser.add_argument('--label', dest="label", type=str, default=None, help="drop element with specified label")
    parser.add_argument('--edge', dest="drop_edge_only", action="store_true", help="only drop edge")
    args = parser.parse_args()
    print(args)
    # bugfix: the required --host/--username/--password arguments were parsed
    # but ignored in favor of a hard-coded endpoint with credentials checked
    # into source control (a security issue). Build the client from the args.
    gdb_client = client.Client('ws://%s:%d/gremlin' % (args.host, args.port),
                               'g', username=args.username, password=args.password)
    gdb_data_remover = GdbDataRemover(gdb_client, args.limit)
    gdb_data_remover.drop(args.label, args.drop_edge_only)

+ 149 - 0
utils/MultiHandler.py

@@ -0,0 +1,149 @@
+from multiprocessing import Queue,Process
+import traceback
+import threading
+import time
+from queue import Empty
+from BiddingKG.dl.common.multiThread import MultiThreadHandler,stop_thread
+import sys
+
class TaskHandler(threading.Thread):
    """Worker thread that drains task_queue through task_handler.

    Subclasses override task_handler(); any extra *args/**kwargs given to the
    constructor are forwarded to every task_handler call. The thread exits
    when the queue is empty.
    """

    def __init__(self,task_queue,result_queue,*args,**kwargs):
        threading.Thread.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.args = args
        self.kwargs = kwargs

    def task_handler(self,item,result_queue,*args,**kwargs):
        # bugfix: the abstract stub was declared as task_handler(self) even
        # though run() invokes it as (item, result_queue, *args, **kwargs);
        # the signature now matches the call site for subclasses to copy.
        raise NotImplementedError

    def run(self):
        # Consume items until the queue drains; each worker stops on its own.
        while(True):
            try:
                if not self.task_queue.empty():
                    print("task queue size is %d"%(self.task_queue.qsize()))
                    # the 1s timeout guards the race between empty() and get()
                    item = self.task_queue.get(True,timeout=1)
                    self.task_handler(item,self.result_queue,*self.args,**self.kwargs)
                else:
                    print("%s thread is done"%(self.name))
                    break
            except Empty as e:
                print("%s thread is done"%(self.name))
                break
            except Exception as e:
                # log and keep consuming; one bad item must not kill the worker
                print("error: %s"%(e))
                print(traceback.format_exc())
+
+
+# class MultiThreadHandler(object):
+#
+#
+#     def __init__(self,task_queue,Task_handler,result_queue,thread_count=1,*args,**kwargs):
+#         self.task_queue = task_queue
+#         self.Task_handler = Task_handler
+#         self.result_queue = result_queue
+#         self.list_thread = []
+#         self.thread_count = thread_count
+#
+#     def run(self):
+#         for i in range(self.thread_count):
+#             th = self.Task_handler(self.task_queue,self.task_handler,self.result_queue)
+#             self.list_thread.append(th)
+#
+#         for th in self.list_thread:
+#             th.start()
+#
+#         while(not self._check_all_done()):
+#             try:
+#                 time.sleep(1)
+#             except KeyboardInterrupt:
+#                 print("interrupted by keyboard")
+#                 self.stop_all()
+#                 break
+#
+#
+#     def _check_all_done(self):
+#         bool_done = True
+#         for th in self.list_thread:
+#             if th.isAlive():
+#                 bool_done = False
+#         return bool_done
+#
+#     def stop_all(self):
+#         for th in self.list_thread:
+#             th.stop()
+
+
+
+
+def test_handler(item,result_queue):
+    item["change"] += 1
+    result_queue.put(item)
+
class MultiHandler():
    """Run task_handler over task_queue with process_count worker processes,
    each spinning up thread_count MultiThreadHandler threads.

    Typing "quit" on stdin (or Ctrl-C) stops the supervising loop early.
    """

    def __init__(self,task_queue,task_handler,result_queue,process_count=1,thread_count=1,*args,**kwargs):
        self.task_queue = task_queue
        self.task_handler = task_handler
        self.result_queue = result_queue
        self.process_count = process_count
        self.thread_count = thread_count

    def processHandler(self,processId,*args,**kwargs):
        # Body of one worker process: run a thread pool until the queue drains.
        threadHandler = MultiThreadHandler(self.task_queue,self.task_handler,self.result_queue,self.thread_count)
        threadHandler.run()
        print("process %s is done"%processId)

    def run(self):
        """Start the worker processes and supervise them until completion."""
        self.list_process = []
        for i in range(self.process_count):
            p = Process(target=self.processHandler,args=("process-%d"%(i),""))
            self.list_process.append(p)
        for p in self.list_process:
            p.start()
        # NOTE(review): sys.stdin.readline() blocks, so this loop only
        # re-checks _check_all_done once per input line — confirm intended.
        while(not self._check_all_done()):
            try:
                time.sleep(1)
                _quit = False
                line = sys.stdin.readline()
                if line.strip()=="quit":
                    _quit = True
                if _quit:
                    break
            except KeyboardInterrupt:
                print("interrupted by keyboard")
                self.stop_all()
                break
        print("the whole process is done")

    def _check_all_done(self):
        # True only when every worker process has exited.
        bool_done = True
        for th in self.list_process:
            if th.is_alive():
                bool_done = False
        return bool_done

    def stop_all(self):
        # Force-stop every worker that is still running.
        for th in self.list_process:
            # bugfix: was `if th.is_alive:` — a bound method is always truthy,
            # so the liveness check never actually ran
            if th.is_alive():
                stop_thread(th)
+
if __name__=="__main__":
    # Demo run: push 100 dicts through 3 processes x 3 threads, then drain.
    list_i = []
    task_queue = Queue()
    result_queue = Queue()
    for i in range(100):
        _dict = {"source":i,"change":i}
        list_i.append(_dict)
        task_queue.put(_dict)
    # a = MultiThreadHandler(task_queue,test_handler,result_queue,thread_count=3)
    a = MultiHandler(task_queue,test_handler,result_queue,process_count=3,thread_count=3)
    a.run()
    # Print results until the queue stays empty for one second.
    while(True):
        try:
            item = result_queue.get(block=True,timeout=1)
            print(item)
        except Exception as e:
            print(traceback.format_exc())
            break

+ 942 - 0
utils/Utils.py

@@ -0,0 +1,942 @@
+'''
+Created on 2018年12月20日
+
+@author: User
+'''
+
+import numpy as np
+import re
+import gensim
+from keras import backend as K
+import os
+
+from threading import RLock
+
+from pai_tf_predict_proto import tf_predict_pb2
+import requests
+
+import time
+
+import smtplib
+from email.mime.application import MIMEApplication
+from email.mime.multipart import MIMEMultipart
+from email.utils import formataddr
+
+
+model_w2v = None
+lock_model_w2v = RLock()
+
+USE_PAI_EAS = False
+
+Lazy_load = False
+
+ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
+
+import traceback
+
def sendEmail(host,username,password,receivers,attachs=[]):
    '''
    Send the data-export notification mail with the given attachments.

    @param host: SMTP server host (port 25 is used)
    @param username: SMTP login, also used as the From address
    @param password: SMTP password
    @param receivers: list of recipient addresses; the first one is shown
                      in the To header
    @param attachs: list of file paths to attach (gbk-encoded filenames)

    Fixes: attachment file handles are now closed (the original leaked
    them via ``open(at,"rb").read()``), and the SMTP connection is closed
    in a ``finally`` block instead of only on the success path.
    '''
    server = None
    try:
        #处理附件
        msg = MIMEMultipart()
        msg["From"] = formataddr(["广州比地数据科技有限公司",username])
        msg["To"] = formataddr(["客户",receivers[0]])
        msg["Subject"] = "数据导出服务"
        for at in attachs:
            # Close each attachment file deterministically.
            with open(at,"rb") as f:
                xlsfile = MIMEApplication(f.read())
            xlsfile.add_header("Content-Disposition","attachment",filename=('gbk', '', at.split("/")[-1]))
            log(at.split("/")[-1])
            msg.attach(xlsfile)
        server = smtplib.SMTP()
        server.connect(host,25)
        server.login(username,password)
        server.sendmail(username,receivers,msg.as_string())
        log("发送邮件成功%s"%str(attachs))
    except Exception as e:
        traceback.print_exc()
        log("发送邮件错误%s"%str(e))
    finally:
        # Always release the connection, even when sending failed.
        if server is not None:
            try:
                server.close()
            except Exception:
                pass
+
def getLegal_str(_str):
    # Strip control characters that are illegal in Excel cells;
    # returns None when the input is None.
    if _str is None:
        return None
    return ILLEGAL_CHARACTERS_RE.sub("", str(_str))
+
def getRow_ots_primary(row):
    '''
    Flatten an OTS row object into a plain dict, merging primary-key and
    attribute columns.  Each column is an indexable (name, value, ...) pair.
    '''
    result = dict()
    if row is None:
        return result
    for col in row.attribute_columns:
        result[col[0]] = col[1]
    for col in row.primary_key:
        result[col[0]] = col[1]
    return result
+
def getRow_ots(rows):
    '''
    Convert an iterable of OTS rows — each a sequence of column groups,
    each group a sequence of (name, value, ...) columns — into dicts.
    '''
    result = []
    for row in rows:
        merged = {}
        for group in row:
            merged.update((col[0], col[1]) for col in group)
        result.append(merged)
    return result
+
def getw2vfilepath():
    # Prefer the vector file next to the package; fall back to a path
    # relative to the working directory.
    candidate = os.path.dirname(__file__) + "/../wiki_128_word_embedding_new.vector"
    return candidate if os.path.exists(candidate) else "wiki_128_word_embedding_new.vector"
+
def getLazyLoad():
    # Reading a module-level flag needs no ``global`` declaration.
    return Lazy_load
+
def get_file_name(url, headers):
    '''
    Derive a download file name: first from the Content-Disposition
    header, then from the URL basename (query string stripped).

    NOTE(review): when neither yields a name this returns a *float*
    timestamp while all other paths return str — callers must accept both.
    '''
    filename = ''
    disposition = headers['Content-Disposition'] if 'Content-Disposition' in headers else None
    if disposition:
        parts = disposition.split(';')
        if len(parts) > 1 and parts[1].strip().lower().startswith('filename='):
            kv = parts[1].split('=')
            if len(kv) > 1:
                filename = kv[1]
    if not filename and os.path.basename(url):
        filename = os.path.basename(url).split("?")[0]
    if not filename:
        return time.time()
    return filename
+
+model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
+model_word = None
+lock_model_word = RLock()
+
+from decimal import Decimal
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+import pickle
+import os
+
+import json
+
+#自定义jsonEncoder
#自定义jsonEncoder
class MyEncoder(json.JSONEncoder):
    '''
    JSON encoder that also serializes numpy arrays/scalars and bytes.

    Fixes: the original defined ``__init__(self)`` that neither forwarded
    nor accepted the keyword arguments ``json.dumps`` passes to ``cls``,
    so ``json.dumps(..., cls=MyEncoder)`` raised TypeError — the inherited
    constructor is used instead.  ``np.floating``/``np.integer`` cover all
    widths and survive numpy >= 2.0, where ``np.float_`` was removed.
    '''

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        # Anything else: defer to the base class (raises TypeError).
        return json.JSONEncoder.default(self, obj)
+
+vocab_word = None
+vocab_words = None
+
+file_vocab_word = "vocab_word.pk"
+file_vocab_words = "vocab_words.pk"
+
+selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
+selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
+selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
+selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
+codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
+codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"
+
+form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
+form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
+person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
+person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
+role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
+role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
+money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
+money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
+codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
+codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"
+
def viterbi_decode(score, transition_params):
    """Decode the highest scoring sequence of tags outside of TensorFlow.

    This should only be used at test time.

    Args:
      score: A [seq_len, num_tags] matrix of unary potentials.
      transition_params: A [num_tags, num_tags] matrix of binary potentials.

    Returns:
      viterbi: A [seq_len] list of the highest scoring tag indices.
      viterbi_score: A float containing the score for the Viterbi sequence.
    """
    seq_len = score.shape[0]
    trellis = np.zeros_like(score)
    backpointers = np.zeros_like(score, dtype=np.int32)
    trellis[0] = score[0]

    # Forward pass: best score of any path ending in each tag at step t.
    for t in range(1, seq_len):
        candidates = np.expand_dims(trellis[t - 1], 1) + transition_params
        trellis[t] = score[t] + candidates.max(axis=0)
        backpointers[t] = candidates.argmax(axis=0)

    # Backward pass: follow backpointers from the best final tag.
    viterbi = [np.argmax(trellis[-1])]
    for pointers in backpointers[:0:-1]:
        viterbi.append(pointers[viterbi[-1]])
    viterbi.reverse()

    return viterbi, np.max(trellis[-1])
+
+import ctypes
+import inspect
+
def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed

    Asynchronously raises ``exctype`` inside the thread with id ``tid``
    via the CPython C API.  CPython-specific; the exception is delivered
    only when the target thread next executes Python bytecode, so threads
    blocked inside C calls (e.g. socket reads) are not interrupted.
    """
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        # PyThreadState_SetAsyncExc expects an exception class, not an instance.
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # More than one thread state was affected: undo and bail out.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")
+
def stop_thread(thread):
    # Best-effort forced termination: injects SystemExit into the thread.
    _async_raise(thread.ident, SystemExit)
+
def limitRun(sess, list_output, feed_dict, MAX_BATCH=1024):
    '''
    Run ``sess.run`` over ``feed_dict`` in slices of at most MAX_BATCH
    samples and concatenate the per-slice outputs, so oversized batches
    do not exhaust memory.

    @param sess: session-like object exposing run(outputs, feed_dict=...)
    @param list_output: list of outputs to fetch
    @param feed_dict: feeds; all values share the same first-dim length
    @return: list with one (possibly concatenated) entry per output
    '''
    keys = list(feed_dict.keys())
    len_sample = len(feed_dict[keys[0]]) if keys else 0
    if len_sample <= MAX_BATCH:
        return sess.run(list_output, feed_dict=feed_dict)
    list_result = [[] for _ in list_output]
    for start in range(0, len_sample, MAX_BATCH):
        batch_feed = {k: v[start:start + MAX_BATCH] for k, v in feed_dict.items()}
        batch_out = sess.run(list_output, feed_dict=batch_feed)
        for i, part in enumerate(batch_out):
            list_result[i].extend(part)
    return list_result
+
+
+
def get_values(response, output_name):
    """
    Get the value of a specified output tensor.
    :param output_name: name of the output tensor
    :return: the tensor content as a numpy array shaped per array_shape
    """
    output = response.outputs[output_name]
    dtype_fields = (
        (tf_predict_pb2.DT_FLOAT, "float_val"),
        (tf_predict_pb2.DT_INT8, "int_val"),
        (tf_predict_pb2.DT_INT16, "int_val"),
        (tf_predict_pb2.DT_INT32, "int_val"),
        (tf_predict_pb2.DT_INT64, "int64_val"),
        (tf_predict_pb2.DT_DOUBLE, "double_val"),
        (tf_predict_pb2.DT_STRING, "string_val"),
        (tf_predict_pb2.DT_BOOL, "bool_val"),
    )
    for dtype, field in dtype_fields:
        if output.dtype == dtype:
            _value = getattr(output, field)
            break
    # An unknown dtype leaves _value unbound and raises NameError here,
    # matching the original behavior.
    return np.array(_value).reshape(response.outputs[output_name].array_shape.dim)
+
def vpc_requests(url, authorization, request_data, list_outputs):
    '''
    POST a serialized PredictRequest to a PAI-EAS endpoint and decode the
    requested outputs; returns None (after logging) on a non-200 response.
    '''
    headers = {"Authorization": authorization}
    resp = requests.post(url, data=request_data, headers=headers)
    if resp.status_code != 200:
        print(resp.status_code, resp.content)
        log("调用pai-eas接口出错,authorization:" + str(authorization))
        return None
    response = tf_predict_pb2.PredictResponse()
    response.ParseFromString(resp.content)
    dict_outputs = dict()
    for _output in list_outputs:
        dict_outputs[_output] = get_values(response, _output)
    return dict_outputs
+
def encodeInput(data, word_len, word_flag=True, userFool=False):
    '''
    Encode a list of token sequences into fixed-length index vectors.

    The first sequence (index 0) is truncated and padded on the left, the
    others on the right.  ``word_flag`` selects char-level lookups
    (getIndexOfWord) vs word-level (getIndexOfWords); with ``userFool``
    char-level input uses the fool vocabulary and pads with 0.
    '''
    result = []
    for out_index, item in enumerate(data):
        left_align = out_index == 0
        list_word = item[-word_len:] if left_align else item[:word_len]
        if word_flag:
            if userFool:
                temp = [getIndexOfWord_fool(w) for w in list_word]
                pad_value = 0
            else:
                temp = [getIndexOfWord(w) for w in list_word]
                pad_value = getIndexOfWord("<pad>")
            padding = [pad_value] * (word_len - len(temp))
            temp = padding + temp if left_align else temp + padding
        else:
            temp = [getIndexOfWords(w) for w in list_word]
            padding = [getIndexOfWords("<pad>")] * (word_len - len(temp))
            # Word-level mode left-pads the first two sequences.
            temp = padding + temp if out_index in (0, 1) else temp + padding
        result.append(temp)
    return result
+
def encodeInput_form(input, MAX_LEN=30):
    # Encode up to MAX_LEN characters into a zero-padded index vector.
    x = np.zeros([MAX_LEN])
    for i, ch in enumerate(input):
        if i >= MAX_LEN:
            break
        x[i] = getIndexOfWord(ch)
    return x
+    
+
def getVocabAndMatrix(model, Embedding_size=60):
    '''
    @summary: build the vocabulary (with a leading "<pad>") and the
    matching embedding matrix from a KeyedVectors-like model.
    '''
    vocab = ["<pad>"] + model.index2word
    embedding_matrix = np.zeros((len(vocab), Embedding_size))
    # Row 0 stays the zero vector for "<pad>".
    for i, word in enumerate(vocab[1:], start=1):
        embedding_matrix[i] = model[word]
    return vocab, embedding_matrix
+
def getIndexOfWord(word):
    '''Map a character to its index in the char vocabulary (lazy-loaded).'''
    global vocab_word, file_vocab_word
    if vocab_word is None:
        # Build the char->index dict once: from the pickled vocab when
        # present, otherwise from the char-embedding model (then cache it).
        if os.path.exists(file_vocab_word):
            vocab = load(file_vocab_word)
        else:
            model = getModel_word()
            vocab, _ = getVocabAndMatrix(model, Embedding_size=60)
            save(vocab, file_vocab_word)
        vocab_word = dict((w, i) for i, w in enumerate(np.array(vocab)))
    return vocab_word[word] if word in vocab_word else vocab_word['<pad>']
+        
def getIndexOfWords(words):
    '''Map a word to its index in the word vocabulary (lazy-loaded).'''
    global vocab_words, file_vocab_words
    if vocab_words is None:
        # Build the word->index dict once: from the pickled vocab when
        # present, otherwise from the word2vec model (then cache it).
        if os.path.exists(file_vocab_words):
            vocab = load(file_vocab_words)
        else:
            model = getModel_w2v()
            vocab, _ = getVocabAndMatrix(model, Embedding_size=128)
            save(vocab, file_vocab_words)
        vocab_words = dict((w, i) for i, w in enumerate(np.array(vocab)))
    return vocab_words[words] if words in vocab_words else vocab_words["<pad>"]
+
def log_tofile(filename):
    '''
    Direct this module's logging to ``filename``.

    Fixes two defects in the original: ``logging.basicConfig`` is a no-op
    once the root logger is configured (which happens at import time
    above), and the result of ``getLogger`` was bound to a *local*
    ``logger``, leaving the module-level logger untouched.
    '''
    global logger
    handler = logging.FileHandler(filename)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)
+
def log(msg):
    '''
    @summary: log a message at INFO level via the module-wide logger.
    '''
    logger.info(msg)
+
def debug(msg):
    '''
    @summary: log a message at DEBUG level via the module-wide logger.
    '''
    logger.debug(msg)
+
+
def save(object_to_save, path):
    '''
    Pickle ``object_to_save`` to ``path``.

    @Arugs:
        object_to_save: the object to persist
        path: destination file path
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)
+
def load(path):
    '''
    Unpickle and return the object stored at ``path``.

    @Arugs:
        path: source file path
    '''
    with open(path, 'rb') as f:
        return pickle.load(f)
+    
+
def getIndexOfWord_fool(word):
    # Look up a character in the fool vocabulary, falling back to "[UNK]".
    # NOTE(review): ``fool_char_to_id`` is not defined or imported anywhere
    # in this file — calling this raises NameError unless another module
    # injects it; confirm where it is supposed to come from.
    if word in fool_char_to_id.keys():
        return fool_char_to_id[word]
    else:
        return fool_char_to_id["[UNK]"]
+
+
def find_index(list_tofind, text):
    '''
    @summary: first occurrence position of each term inside ``text``
    @param:
        list_tofind: terms to search for
        text: the string searched
    @return: list of indices, -1 for terms that do not occur
    '''
    # str.find already yields -1 for a missing term.
    return [text.find(term) for term in list_tofind]
+
+
def combine(list1, list2):
    '''
    @summary: concatenate every element of list1 with every element of
    list2 (as strings), in list1-major order.
    '''
    return [str(a) + str(b) for a in list1 for b in list2]
+
+
def getDigitsDic(unit):
    '''
    @summary: map a Chinese digit character (financial or plain form) to
    its integer value; None when unknown.
    '''
    financial = "零壹贰叁肆伍陆柒捌玖"
    plain = "〇一二三四五六七八九"
    for value in range(10):
        if unit == financial[value] or unit == plain[value]:
            return value
    return None
+
def getMultipleFactor(unit):
    '''
    @summary: map a Chinese unit character to its numeric factor as a
    Decimal; None when unknown.
    '''
    factors = {
        "兆": Decimal(1000000000000),
        "亿": Decimal(100000000),
        "万": Decimal(10000),
        "仟": Decimal(1000), "千": Decimal(1000),
        "佰": Decimal(100), "百": Decimal(100),
        "拾": Decimal(10), "十": Decimal(10),
        "元": Decimal(1),
        "角": round(Decimal(0.1), 1),
        "分": round(Decimal(0.01), 2),
    }
    return factors.get(unit)
+
def getUnifyMoney(money):
    '''
    @summary: convert a money string (mixed Arabic digits and Chinese
              numerals/units) into a numeric amount
    @param:
        money: money string, e.g. "壹万贰仟元" or "1,200.50"
    @return: Decimal amount
    '''


    # NOTE(review): MAX_NUM is never used.
    MAX_NUM = 12
    # strip comma separators
    money = re.sub("[,,]","",money)
    # drop every character that is not a digit, dot, Chinese numeral or unit
    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億〇一二三四五六七八九十百千万亿元角分]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","元","角","分"]

    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
    # Plain Arabic number: convert directly.
    if re.search(LowMoneypattern,money) is not None:
        return Decimal(money)
    # A single financial digit, optionally preceded by 零.
    elif re.search(BigMoneypattern,money) is not None:
        return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
    # Otherwise split on the largest unit present: value = left*factor + right.
    for factorUnit in chnFactorUnits:
        if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
            # Split on the LAST occurrence of the unit character.
            subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
            if re.search(re.compile("^(\d+(,)?)+(\.\d+)?$"),subMoneys[0]) is not None:
                # Left side is a plain number.
                result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
            elif len(subMoneys[0])==1:
                # Left side is a single Chinese financial digit.
                if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                    result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
            else:
                # Left side is itself compound -> recurse.
                result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))

            if len(subMoneys)>1:
                if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                    # NOTE(review): this branch passes strings that may still
                    # contain 百/千/万/亿 to Decimal() — confirm intended.
                    result += Decimal(subMoneys[1])
                elif len(subMoneys[1])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[1]))
                else:
                    # Right side may contain smaller units -> recurse.
                    result += Decimal(getUnifyMoney(subMoneys[1]))
            break
    return result
+
+
+
+
def getModel_w2v():
    '''
    @summary: lazily load and cache the 128-dim word2vec model;
    the RLock makes concurrent first calls safe.
    '''
    global model_w2v,lock_model_w2v
    with lock_model_w2v:
        if model_w2v is None:
            model_w2v = gensim.models.KeyedVectors.load_word2vec_format(getw2vfilepath(),binary=True)
        return model_w2v
+
def getModel_word():
    '''
    @summary: lazily load and cache the char-embedding model; the RLock
    makes concurrent first calls safe.

    Fix: the original declared ``global model_word, lock_model_w2v`` —
    naming the *wrong* lock (the function locks ``lock_model_word``).
    Only ``model_word`` is assigned, so only it needs the declaration.
    '''

    global model_word
    with lock_model_word:
        if model_word is None:
            model_word = gensim.models.KeyedVectors.load_word2vec_format(model_word_file,binary=True)
        return model_word
+
+# getModel_w2v()
+# getModel_word()
+
def findAllIndex(substr, wholestr):
    '''
    @summary: begin indices of all non-overlapping occurrences of substr
    @param:
        substr: the substring to look for
        wholestr: the string searched
    @return: list of begin indices
    '''
    result = []
    start = 0
    while True:
        idx = wholestr.find(substr, start)
        if idx < 0:
            break
        result.append(idx)
        # Continue after this occurrence (non-overlapping search).
        start = idx + len(substr)
    return result
+    
+  
def spanWindow(tokens, begin_index, end_index, size, center_include=False, word_flag=False, use_text=False, text=None):
    '''
    @summary: context tokens around an entity span
    @param:
        tokens: tokenized sentence
        begin_index / end_index: entity span (inclusive)
        size: window width on each side
        center_include: also return the entity itself
        word_flag: True -> join tokens into strings, False -> token lists
        use_text: with center_include, return ``text`` instead of the span
    @return: [left, (center,) right]
    '''
    if use_text:
        assert text is not None
    length = len(tokens)
    begin = max(begin_index - size, 0)
    end = min(end_index + size + 1, length)
    left = tokens[begin:begin_index]
    center = text if use_text else tokens[begin_index:end_index + 1]
    right = tokens[end_index + 1:end]
    if word_flag:
        left = "".join(left)
        right = "".join(right)
        if not use_text:
            center = "".join(center)
    result = [left]
    if center_include:
        result.append(center)
    result.append(right)
    return result
+
# Balance a single unmatched bracket around a code/name string.
def fitDataByRule(data):
    '''Fix one unmatched bracket at the edge of ``data``; strips 。 afterwards.'''
    pairs = {"(": ")", "(": ")", "[": "]", "【": "】",
             ")": "(", ")": "(", "]": "[", "】": "【"}
    lefts = re.findall("[\((\[【]", data)
    rights = re.findall("[\))\]】]", data)
    result = data
    if len(lefts) == len(rights):
        # balanced (covers the no-brackets case too): nothing to fix
        return data
    if abs(len(lefts) - len(rights)) == 1:
        if len(lefts) > len(rights):
            # one extra opening bracket: drop a leading bracket, else close it
            result = data[1:] if pairs.get(data[0]) is not None else data + pairs.get(lefts[0])
        else:
            # one extra closing bracket: drop a trailing bracket, else open it
            result = data[:-1] if pairs.get(data[-1]) is not None else pairs.get(rights[0]) + data
    result = re.sub("[。]", "", result)
    return result
+
def embedding(datas, shape):
    '''
    @summary: look up word vectors for a batch of token lists
    @param:
        datas: list of token lists
        shape: (batch, seq_len, emb_dim) of the result
    @return: ndarray of the given shape; unknown tokens keep the zero vector
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    seq_len = shape[1]
    for row, data in enumerate(datas):
        for col, item in enumerate(data[:seq_len]):
            token = re.sub("\s*", "", item)
            if token in model_w2v.vocab:
                embed[row][col] = model_w2v[token]
            # unknown tokens leave the slot zeroed, position still advances
    return embed
+
def embedding_word(datas, shape):
    '''
    @summary: look up char vectors for a batch of strings, using the last
    shape[1] characters of each entry
    @return: ndarray of the given shape; unknown chars keep the zero vector
    '''
    model_w2v = getModel_word()
    embed = np.zeros(shape)
    seq_len = shape[1]
    for row, data in enumerate(datas):
        for col, ch in enumerate(str(data)[-seq_len:]):
            if ch in model_w2v.vocab:
                embed[row][col] = model_w2v[ch]
            # unknown chars leave the slot zeroed, position still advances
    return embed
+
def formEncoding(text, shape=(100, 60), expand=False):
    # Encode up to shape[0] characters of ``text`` as rows of char
    # vectors; optionally prepend a batch axis.
    encoded = np.zeros(shape)
    word_model = getModel_word()
    for i, ch in enumerate(text[:shape[0]]):
        if ch in word_model.vocab:
            encoded[i] = word_model[ch]
    return np.expand_dims(encoded, 0) if expand else encoded
+
def partMoney(entity_text, input2_shape=[7]):
    '''
    @summary: one-hot bucket of a money amount by order of magnitude
    @param:
        entity_text: numeric money value (string or number)
        input2_shape: shape of the one-hot vector (7 buckets)
    @return: ndarray one-hot encoding
    '''
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    thresholds = [100, 1000, 10000, 100000, 1000000, 10000000]
    for bucket, limit in enumerate(thresholds):
        if money < limit:
            parts[bucket] = 1
            break
    else:
        # >= 10 million falls into the last bucket
        parts[6] = 1
    return parts
+
def recall(y_true, y_pred):
    '''
    Compute recall as a Keras metric.
    @Argus:
        y_true: ground-truth labels
        y_pred: model predictions

    @Return
        recall value
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
    # NOTE(review): ``c3 == 0`` compares a backend tensor to 0 — in graph
    # mode this is likely never True, so the zero-division guard may be
    # ineffective; confirm against the Keras backend in use.
    if c3 == 0:
        return 0
    recall = c1 / c3
    return recall
+
+
def f1_score(y_true, y_pred):
    '''
    Compute the F1 score as a Keras metric.

    @Argus:
        y_true: ground-truth labels
        y_pred: model predictions

    @Return
        F1 value
    '''

    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
    precision = c1 / c2
    # NOTE(review): tensor compared to 0 — guard may be ineffective in
    # graph mode; also precision+recall == 0 divides by zero below.
    if c3 == 0:
        recall = 0
    else:
        recall = c1 / c3
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score
+
+
def precision(y_true, y_pred):
    '''
    Compute precision as a Keras metric.

    @Argus:
        y_true: ground-truth labels
        y_pred: model predictions

    @Return
        precision value
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    # NOTE(review): divides by zero when nothing is predicted positive.
    precision = c1 / c2
    return precision
+
+# def print_metrics(history):
+#     '''
+#     制作每次迭代的各metrics变化图片
+#
+#     @Arugs:
+#         history: 模型训练迭代的历史记录
+#     '''
+#     import matplotlib.pyplot as plt
+#
+#     # loss图
+#     loss = history.history['loss']
+#     val_loss = history.history['val_loss']
+#     epochs = range(1, len(loss) + 1)
+#     plt.subplot(2, 2, 1)
+#     plt.plot(epochs, loss, 'bo', label='Training loss')
+#     plt.plot(epochs, val_loss, 'b', label='Validation loss')
+#     plt.title('Training and validation loss')
+#     plt.xlabel('Epochs')
+#     plt.ylabel('Loss')
+#     plt.legend()
+#
+#     # f1图
+#     f1 = history.history['f1_score']
+#     val_f1 = history.history['val_f1_score']
+#     plt.subplot(2, 2, 2)
+#     plt.plot(epochs, f1, 'bo', label='Training f1')
+#     plt.plot(epochs, val_f1, 'b', label='Validation f1')
+#     plt.title('Training and validation f1')
+#     plt.xlabel('Epochs')
+#     plt.ylabel('F1')
+#     plt.legend()
+#
+#     # precision图
+#     prec = history.history['precision']
+#     val_prec = history.history['val_precision']
+#     plt.subplot(2, 2, 3)
+#     plt.plot(epochs, prec, 'bo', label='Training precision')
+#     plt.plot(epochs, val_prec, 'b', label='Validation pecision')
+#     plt.title('Training and validation precision')
+#     plt.xlabel('Epochs')
+#     plt.ylabel('Precision')
+#     plt.legend()
+#
+#     # recall图
+#     recall = history.history['recall']
+#     val_recall = history.history['val_recall']
+#     plt.subplot(2, 2, 4)
+#     plt.plot(epochs, recall, 'bo', label='Training recall')
+#     plt.plot(epochs, val_recall, 'b', label='Validation recall')
+#     plt.title('Training and validation recall')
+#     plt.xlabel('Epochs')
+#     plt.ylabel('Recall')
+#     plt.legend()
+#
+#     plt.show()
+
+import pandas as pd
+dict_name_locations = {}
+dict_id_location = {}
def getLocationDict():
    '''Load 省份信息.xlsx into the module-level location lookup tables.'''
    global dict_name_locations, dict_id_location
    df = pd.read_excel(os.path.dirname(__file__) + "/省份信息.xlsx")
    for _id, _cname, _parentid, _ctype in zip(df["id"], df["cname"], df["parentid"], df["ctype"]):
        entry = {"id": _id, "cname": _cname, "parentid": _parentid, "ctype": _ctype}
        dict_id_location[_id] = entry
        # A name can map to several locations (e.g. same district name in
        # different provinces).
        dict_name_locations.setdefault(_cname, []).append(entry)
+
+getLocationDict()
+
def getProvinceCityDistrict(loc):
    '''
    For every location named ``loc``, walk up the parent chain and collect
    its ancestors with ctype >= 20 (administrative levels), keyed by ctype.
    '''
    list_result = []
    for candidate in dict_name_locations.get(loc, []):
        ancestors = {}
        node = candidate
        while node is not None:
            if node.get("ctype") >= 20:
                ancestors[node.get("ctype")] = node
            node = dict_id_location.get(node.get("parentid"))
        if ancestors:
            list_result.append(ancestors)
    return list_result
+
def chooseLocation(list_result):
    '''
    Pick (province, city, district) from candidate ancestor dicts: the
    most frequent province wins; among its candidates, the one with the
    most levels filled in supplies city and district.
    (Side effect preserved from the original: sorts ``list_result`` in place.)
    '''
    province, city, district = "", "", ""
    votes = {}
    for candidate in list_result:
        name = candidate.get(20, {}).get("cname", "")
        if name != "":
            votes[name] = votes.get(name, 0) + 1
    max_province = ""
    max_count = 0
    for name, count in votes.items():
        if count > max_count:
            max_province, max_count = name, count

    if list_result:
        list_result.sort(key=lambda c: len(list(c.keys())), reverse=True)
        for candidate in list_result:
            if candidate.get(20, {}).get("cname", "") != max_province:
                continue
            province = max_province
            city = candidate.get(30, {}).get("cname", "")
            district = candidate.get(40, {}).get("cname", "")
            break
    return province, city, district
+
+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
+    _time = time.strftime(format,time.localtime())
+    return _time
+
def getLocation(_str):
    '''
    Extract (province, city, district) from free text.  Branch-company
    mentions ("<place>分...") take priority; only when none match is the
    whole text scanned for plain location names.
    '''
    names = list(dict_name_locations.keys())
    name_pattern = "(?P<locations>%s)" % "|".join(names)
    branch_pattern = "(%s)分" % name_pattern
    list_result = []
    for match in re.finditer(branch_pattern, _str):
        list_result.extend(getProvinceCityDistrict(match.groupdict().get("locations")))
    if list_result:
        return chooseLocation(list_result)
    for match in re.finditer(name_pattern, _str):
        list_result.extend(getProvinceCityDistrict(match.groupdict().get("locations")))
    return chooseLocation(list_result)
+
+
+
+
+if __name__=="__main__":
+    print(getLocation("佛山市顺德区顺控路桥投资有限公司"))
+    # print(fool_char_to_id[">"])
+    # model = getModel_w2v()
+    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
+    # save([vocab,matrix],"vocabMatrix_words.pk")
+    pass

+ 0 - 0
utils/__init__.py


+ 89 - 0
utils/hashUtil.py

@@ -0,0 +1,89 @@
+#encoding:utf8
+
+import base64
+from Crypto.Cipher import AES
+from Crypto import Random
+import pandas as pd
class AESCipher:
    '''
    AES helper.

    NOTE(review): despite the CBC-oriented comments, ``mode`` is ECB, the
    iv generated in ``encrypt`` is never used, and ``decrypt`` passes an
    iv to ``AES.new`` in ECB mode (which raises TypeError).  ``encrypt``
    (hex output, no iv prefix) and ``decrypt`` (base64 input, 32-byte iv
    prefix) are not inverses as written — confirm the intended scheme
    before relying on round-trips.
    '''
    def __init__(self):
        '''
        Set up key, block size, mode and padding helpers.
        (Original note: CBC needs a 16-byte key and a 16-byte iv.)
        '''
        self.key = self.check_key(base64.b64decode('zDXjL5mx5HeUgmsg7HyKLg==') )
        # block size — NOTE(review): original comment said 16, value is 32
        self.BS = 32
        # cipher mode — ECB, not the CBC the original comments describe
        self.mode = AES.MODE_ECB
        # PKCS7-style padding to a multiple of BS bytes
        self.pad = lambda s: s + (self.BS - len(s.encode()) % self.BS)*chr(self.BS - len(s.encode()) % self.BS)
        # strip that padding again (last byte encodes the pad length)
        self.unpad = lambda s: s[:-ord(s[len(s) - 1:])]

    def check_key(self, key):
        '''
        Validate that the key is 16, 24 or 32 bytes long; returns the key
        as bytes, or None (after printing) when the length is wrong.
        '''
        try:
            if isinstance(key, bytes):
                assert len(key) in [16, 24, 32]
                return key
            elif isinstance(key, str):
                assert len(key.encode()) in [16, 24, 32]
                return key.encode()
            else:
                raise Exception('密钥必须为str或bytes,不能为%s'%type(key))
        except AssertionError:
            print('输入的长度不正确')

    def check_data(self, data):
        '''
        Normalize the plaintext to str (accepts int, bytes or str).
        '''
        if isinstance(data, int):
            data = str(data)
        elif isinstance(data, bytes):
            data = data.decode()
        elif isinstance(data, str):
            pass
        else:
            raise Exception('加密的数据必须为str或bytes,不能为%s'%type(data))
        return data

    def text2hex(self,_str):
        # Hex-encode a str/bytes value byte by byte.
        list_hex = []
        if isinstance(_str,str):
            _str1 = _str.encode()
        elif isinstance(_str,bytes):
            _str1 = _str
        for _hex in map(hex,_str1):
            _i = int(_hex[2:],16)
            while(True):
                a = _i&0xFF
                list_hex.append(hex(a)[2:].rjust(2,"0"))
                _i = _i >>8
                if _i==0:
                    break
        return "".join(list_hex)

    def encrypt(self, raw):
        # Pad, ECB-encrypt, return hex.
        raw = self.check_data(raw)
        raw = self.pad(raw).encode()
        # random iv — NOTE(review): generated but unused in ECB mode
        iv = Random.new().read(AES.block_size)
        # build the cipher
        cipher = AES.new(self.key, self.mode)
        # NOTE(review): the base64 return below is unreachable dead code
        return self.text2hex(cipher.encrypt(raw))
        return base64.b64encode(cipher.encrypt(raw)).decode()

    def decrypt(self, enc):
        # base64-decode, split off a 32-byte "iv", decrypt the remainder.
        # NOTE(review): AES.new with an iv argument in ECB mode raises
        # TypeError — this method cannot currently succeed as written.
        enc = base64.b64decode(enc)
        # the presumed iv prefix
        iv = enc[:self.BS]
        # build the cipher
        cipher = AES.new(self.key, self.mode, iv)
        # strip padding and return utf-8 text
        return self.unpad(cipher.decrypt(enc[self.BS:])).decode()
+
+aesCipher = AESCipher()

+ 108 - 0
utils/multiThread.py

@@ -0,0 +1,108 @@
+
+import threading
+import queue
+import time
+import traceback
+
+import ctypes
+import inspect
+import sys
+
def _async_raise(tid, exctype):
    """Asynchronously raise ``exctype`` inside the thread with id ``tid``.

    Uses the CPython C-API PyThreadState_SetAsyncExc; the exception is
    delivered the next time the target thread executes bytecode.
    """
    if not inspect.isclass(exctype):
        # The C-API wants an exception class, not an instance.
        exctype = type(exctype)
    target = ctypes.c_long(tid)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(target, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    if res != 1:
        # More than one thread state was affected: undo and report failure.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(target, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")
+
def stop_thread(thread):
    # Force-stop a thread by asynchronously raising SystemExit inside it.
    # NOTE(review): the exception only lands when the thread next runs
    # bytecode — a thread blocked in C code (e.g. a blocking queue.get)
    # will not be interrupted until it returns.
    _async_raise(thread.ident, SystemExit)
+
+class _taskHandler(threading.Thread):
+
+    def __init__(self,task_queue,task_handler,result_queue,*args,**kwargs):
+        threading.Thread.__init__(self)
+        self.task_queue = task_queue
+        self.task_handler = task_handler
+        self.result_queue = result_queue
+        self.args = args
+        self.kwargs = kwargs
+
+    def run(self):
+        while(True):
+            try:
+                print("task queue size is %d"%(self.task_queue.qsize()))
+                item = self.task_queue.get(False)
+                self.task_handler(item,self.result_queue,*self.args,**self.kwargs)
+                # self.task_queue.task_done()
+            except queue.Empty as e:
+                print("%s thread is done"%(self.name))
+                break
+            except Exception as e:
+                print("error: %s"%(e))
+                print(traceback.format_exc())
+
class MultiThreadHandler(object):
    """Processes ``task_queue`` with ``thread_count`` _taskHandler workers.

    run() blocks until every worker finishes (queue drained), or until the
    user hits Ctrl-C, in which case all still-running workers are killed.

    Fixes in this revision: ``Thread.isAlive()`` was removed in Python 3.9
    (replaced by ``is_alive()``); ``setDaemon(True)`` is deprecated in
    favour of assigning ``daemon``; and ``stop_all`` tested the bound
    method object ``th.isAlive`` (always truthy) instead of calling it.
    """

    def __init__(self, task_queue, task_handler, result_queue, thread_count=1, *args, **kwargs):
        self.task_queue = task_queue
        self.task_handler = task_handler
        self.result_queue = result_queue
        self.list_thread = []
        self.thread_count = thread_count
        # Extra args are forwarded unchanged to every task_handler call.
        self.args = args
        self.kwargs = kwargs

    def run(self):
        """Spawn the workers, then wait for all of them to finish."""
        for _ in range(self.thread_count):
            th = _taskHandler(self.task_queue, self.task_handler,
                              self.result_queue, *self.args, **self.kwargs)
            # Daemon threads so an exiting main process does not hang on
            # workers. (setDaemon() is deprecated — assign the attribute.)
            th.daemon = True
            self.list_thread.append(th)

        for th in self.list_thread:
            th.start()

        # Poll instead of join() so KeyboardInterrupt stays responsive.
        while not self._check_all_done():
            try:
                time.sleep(1)
            except KeyboardInterrupt:
                print("interrupted by keyboard")
                self.stop_all()
                break
        print("the whole task is done")

    def _check_all_done(self):
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        for th in self.list_thread:
            if th.is_alive():
                return False
        return True

    def stop_all(self):
        for th in self.list_thread:
            # BUG FIX: the original tested the bound method `th.isAlive`
            # (always truthy) rather than calling it.
            if th.is_alive():
                stop_thread(th)
+
def test_handler(item,result_queue):
    # Demo handler for the self-test below: just prints the task item.
    # result_queue is accepted to satisfy the handler signature but unused.
    print(item)
+
+if __name__=="__main__":
+    task_queue = queue.Queue()
+    result_queue = queue.Queue()
+    for i in range(100):
+        task_queue.put(i)
+    a = MultiThreadHandler(task_queue=task_queue,task_handler=test_handler,result_queue=result_queue,thread_count=3)
+    a.run()

BIN
utils/省份信息.xlsx


Một số tệp đã không được hiển thị bởi vì quá nhiều tập tin thay đổi trong này khác