--新增规则 根据公告附件进行去重
--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),1 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,doctitle_refine,win_tenderer,win_bid_price having doctitle_refine!="" and doctitle_refine is not NULL and win_tenderer!="" and win_bid_price!="" and count(1)>1;
-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),2 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_code,project_name,win_tenderer,win_bid_price having project_code!="" and project_code is not NULL and project_name!="" and project_name is not NULL and win_tenderer!="" and win_bid_price!="" and count(1)>1;
--中标公告 编号 标题 中标人 中标价 站源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),3 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price having project_code!="" and project_code is not NULL and doctitle_refine!="" and doctitle_refine is not NULL and win_tenderer!="" and win_bid_price="" and count(1)>1;
--招标 编号 标题 招标人 预算 站源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),4 from run_dumplicate_document_his where docchannel='52' group by project_code,doctitle_refine,tenderee,bidding_budget having project_code!="" and project_code is not NULL and doctitle_refine!="" and doctitle_refine is not NULL and tenderee!="" and tenderee is not NULL and bidding_budget!="" and count(1)>1;
-- 同一个招标人同一天采购同一样物品的时候,这个规则就不适用了 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his -- where docchannel='52' -- group by project_name,tenderee,bidding_budget -- having project_name!="" and project_name is not NULL -- and tenderee!="" and tenderee is not NULL -- and bidding_budget!="";
--招标公告 编号 名称 预算 站源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his where docchannel not in (101,118,119,120) group by docchannel,project_code,project_name,bidding_budget having project_name!="" and project_name is not NULL and project_code!="" and project_code is not NULL and bidding_budget!="" and count(1)>1;
-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),6 from run_dumplicate_document_his where docchannel in (52,118) group by docchannel,project_name,agency,bidding_budget having project_name!="" and project_name is not NULL and agency!="" and agency is not NULL and count(1)>1;
-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),7 from run_dumplicate_document_his where docchannel in (52,118) group by docchannel,project_code,agency,bidding_budget having project_code!="" and project_code is not NULL and agency!="" and agency is not NULL and count(1)>1;
-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 - 同类型 - 信息源>1 - 同项目编号 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),8 from run_dumplicate_document_his where docchannel not in (101,119,120) group by docchannel,project_name,page_time_stamp,tenderee,bidding_budget,project_code having project_name!="" and project_name is not NULL and page_time_stamp>0 and tenderee!="" and tenderee is not NULL and bidding_budget!="" and project_code!="" and project_code is not NULL and count(1)>1;
-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0) insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),9 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_code,project_name,win_tenderer,win_bid_price having project_code!="" and project_code is not NULL and project_name!="" and project_name is not NULL and win_tenderer!="" and win_bid_price="" and count(1)>1;
-- 8. 中标公告 - 同项目名称 - 同发布日期 - 同中标人 - 同中标价 - 同类型 - 信息源>1 - 同项目编号 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),10 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_name,page_time_stamp,win_tenderer,win_bid_price,project_code having project_name!="" and project_name is not NULL and page_time_stamp>0 and win_tenderer!="" and win_bid_price!="" and project_code!="" and project_code is not NULL and count(1)>1;
-- -- 6. 不同公告类型 - 同原标题- 同日期 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid(docid,page_time_stamp,extract_count,docchannel,2,tenderee),11 from run_dumplicate_document_his -- group by doctitle,page_time_stamp -- having doctitle!="" and doctitle is not NULL -- and page_time_stamp>0 -- and count(1)>1;
-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),12 from run_dumplicate_document_his where docchannel in (52,118) group by docchannel,doctitle_refine,tenderee,bidding_budget having doctitle_refine!="" and doctitle_refine is not NULL and tenderee!="" and tenderee is not NULL and count(1)>1;
-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0) insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),13 from run_dumplicate_document_his where docchannel='52' group by project_code,doctitle_refine,agency,bidding_budget having project_code!="" and project_code is not NULL and doctitle_refine!="" and doctitle_refine is not NULL and agency!="" and agency is not NULL and bidding_budget!="" and count(1)>1;
-- 公告内容完全相同的去重 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,1),0 from run_dumplicate_document_his group by fingerprint having length(fingerprint)>0 and count(1)>1;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,agency,1,doctitle_refine),35 -- from run_dumplicate_document_his -- group by docchannel,page_time,win_bid_price,bidding_budget -- having length(win_bid_price)>0 -- and length(bidding_budget)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,tenderee,1,doctitle_refine),36 -- from run_dumplicate_document_his -- group by docchannel,page_time,agency,bidding_budget -- having length(agency)>0 -- and length(bidding_budget)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,bidding_budget,agency,1,doctitle_refine),37 -- from run_dumplicate_document_his -- group by docchannel,page_time,win_tenderer,win_bid_price -- having length(win_tenderer)>0 -- and length(win_bid_price)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,bidding_budget,1,doctitle_refine),38 -- from run_dumplicate_document_his -- group by docchannel,page_time,win_tenderer,agency -- having length(win_tenderer)>0 -- and length(agency)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,bidding_budget,1,doctitle_refine),39 -- from run_dumplicate_document_his -- group by docchannel,page_time,win_bid_price,agency -- having length(win_bid_price)>0 -- and length(agency)>0 -- and count(1)>1 -- ;
-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),14 from run_dumplicate_document_his where docchannel in (52,118) group by docchannel,project_code,tenderee,bidding_budget having project_code!="" and project_code is not NULL and tenderee!="" and tenderee is not NULL and count(1)>1;
-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),15 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price having project_code!="" and project_code is not NULL and doctitle_refine!="" and doctitle_refine is not NULL and win_tenderer!="" and win_bid_price!="" and count(1)>1;
--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),16 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_code,win_tenderer,win_bid_price having project_code!="" and project_code is not NULL and win_tenderer!="" and win_bid_price!="" and count(1)>1;
--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),17 from run_dumplicate_document_his where docchannel in (101,119,120) group by docchannel,project_name,win_tenderer,win_bid_price having project_name!="" and project_name is not NULL and win_tenderer!="" and win_bid_price!="" and count(1)>1;
-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),18 from run_dumplicate_document_his where docchannel in (52,118) group by docchannel,doctitle_refine,agency,bidding_budget having doctitle_refine!="" and doctitle_refine is not NULL and agency!="" and agency is not NULL and count(1)>1;
-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),19 from run_dumplicate_document_his where docchannel='52' group by project_code,project_name,agency,bidding_budget having project_code!="" and project_code is not NULL and project_name!="" and project_name is not NULL and agency!="" and agency is not NULL and bidding_budget!="" and count(1)>1;
-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),20 from run_dumplicate_document_his where docchannel='52' group by project_code,project_name,tenderee,bidding_budget having project_code!="" and project_code is not NULL and project_name!="" and project_name is not NULL and tenderee!="" and tenderee is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),21 from run_dumplicate_document_his group by docchannel,doctitle_refine,tenderee,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and doctitle_refine!="" and tenderee!="" and tenderee is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),22 from run_dumplicate_document_his group by docchannel,project_code,tenderee,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and project_code!="" and project_code is not NULL and tenderee!="" and tenderee is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),23 from run_dumplicate_document_his group by docchannel,project_name,tenderee,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and project_name!="" and project_name is not NULL and tenderee!="" and tenderee is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),24 from run_dumplicate_document_his group by docchannel,doctitle_refine,agency,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and doctitle_refine!="" and agency!="" and agency is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),25 from run_dumplicate_document_his group by docchannel,project_code,agency,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and project_code!="" and project_code is not NULL and agency!="" and agency is not NULL and bidding_budget!="" and count(1)>1;
-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源 insert into document_group_his(json_set_docid,rule_id) select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),26 from run_dumplicate_document_his group by docchannel,project_name,agency,bidding_budget,page_time_stamp having docchannel in (51,103,115,116) and project_name!="" and project_name is not NULL and agency!="" and agency is not NULL and bidding_budget!="" and count(1)>1;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,agency,1,doctitle_refine),30 -- from run_dumplicate_document_his -- group by docchannel,page_time,tenderee,bidding_budget -- having length(tenderee)>0 -- and length(bidding_budget)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,bidding_budget,win_bid_price,agency,1,doctitle_refine),31 -- from run_dumplicate_document_his -- group by docchannel,page_time,tenderee,win_tenderer -- having length(tenderee)>0 -- and length(win_tenderer)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,bidding_budget,agency,1,doctitle_refine),32 -- from run_dumplicate_document_his -- group by docchannel,page_time,tenderee,win_bid_price -- having length(tenderee)>0 -- and length(win_bid_price)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,bidding_budget,1,doctitle_refine),33 -- from run_dumplicate_document_his -- group by docchannel,page_time,tenderee,agency -- having length(tenderee)>0 -- and length(agency)>0 -- and count(1)>1 -- ;
-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值 -- insert into document_group_his(json_set_docid,rule_id) -- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,agency,1,doctitle_refine),34 -- from run_dumplicate_document_his -- group by docchannel,page_time,win_tenderer,bidding_budget -- having length(win_tenderer)>0 -- and length(bidding_budget)>0 -- and count(1)>1 -- ;
--标题和类型相同的公告分为 编号 预算 中标人 中标价 代理都为空 及其它 两组 对这两组的数据进行匹配 规则是招标人相同且站源不同 insert into document_group_his(json_set_docid,rule_id) select F_SET_DOCID_BINARYCHART(docid,page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no),0 from run_dumplicate_document_his where 1=1 group by doctitle_refine,docchannel having length(doctitle_refine)>7 and count(1)>1;