|
@@ -1005,6 +1005,8 @@ num_pattern = re.compile("^\d+(?:\.\d+)?$")
|
|
|
# Matches a run of Chinese numerals (一..十) and/or ASCII letters, e.g. the
# "三" and "ABC" in "第三期ABC项目".
num1_pattern = re.compile(r"[一二三四五六七八九十A-Za-z]+")
|
|
|
# Matches a short place name: one or two characters that are not an opening
# bracket ("[", "【" or "(") followed by an administrative/road suffix
# (市, 区, 镇, 县, 村 or 路).
# A raw string is used so that "\[" and "\(" reach the regex engine verbatim
# instead of being parsed as (invalid) Python string escapes.
location_pattern = re.compile(r"[^\[【\(]{1,2}[市区镇县村路]")
|
|
|
# Regex source (kept as a plain string, not compiled) that matches
# project-type keywords found in titles (tender agency, design, supervision,
# EPC, survey, budget review, ...) plus an ordinal round/batch marker such as
# "第二次" or "第(2)批".
# A raw string is used so that "\(" and "\)" reach the regex engine verbatim
# instead of being parsed as (invalid) Python string escapes.
building_pattern = r"工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九十1-9]+[)\)]?[次批]"
|
|
|
+# 标题中被括号括起来的重点内容
|
|
|
+brackets_pattern = "【([^【】]+?)】" # |{([^{}]+?)}
|
|
|
# Regex source (plain string) whose alternatives are: the literal "再次", the
# literal "重新招标", or one or more Chinese numerals followed by "次".
# NOTE(review): the name suggests it flags re-tender notices -- confirm at the
# call sites.
rebid_pattern = "|".join((
    "再次",
    "重新招标",
    "[一二三四五六七八九十]+次",
))
|
|
|
# Matches a date written as <2-4 digit year><sep><1-2 digit month><sep><1-2
# digit day>, where the separators are "-", ".", "/" or the Chinese markers
# 年 / 月 (e.g. "2023-01-05", "2023年1月5"); a trailing "日" is not consumed.
# A raw string is used so that "\d", "\-" and "\." reach the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes.
date_pattern = re.compile(r"\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
|
|
|
def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
|
|
@@ -1068,7 +1070,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
return False
|
|
|
|
|
|
#check location and keywords
|
|
|
- for _p in [num1_pattern,building_pattern]:
|
|
|
+ for _p in [num1_pattern,building_pattern,brackets_pattern]:
|
|
|
num_all_l = re.findall(_p,doctitle_refind_less)
|
|
|
num_all_g = re.findall(_p,doctitle_refind_greater)
|
|
|
set_num_l = set(num_all_l)
|
|
@@ -1404,21 +1406,21 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
_md5 = _l.get("fileMd5")
|
|
|
if _md5 is not None:
|
|
|
set_md5_greater.add(_md5)
|
|
|
- if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
|
|
|
- one_in_attach = False
|
|
|
- dict_enterprise_less = json.loads(nlp_enterprise_less)
|
|
|
- dict_enterprise_greater = json.loads(nlp_enterprise_greater)
|
|
|
- indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
|
|
|
- notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
|
|
|
- indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
|
|
|
- notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
|
|
|
- if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
|
|
|
- one_in_attach = True
|
|
|
- if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
|
|
|
- one_in_attach = True
|
|
|
- if one_in_attach:
|
|
|
- if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
- return 1
|
|
|
+ # if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
|
|
|
+ # one_in_attach = False
|
|
|
+ # dict_enterprise_less = json.loads(nlp_enterprise_less)
|
|
|
+ # dict_enterprise_greater = json.loads(nlp_enterprise_greater)
|
|
|
+ # indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
|
|
|
+ # notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
|
|
|
+ # indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
|
|
|
+ # notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
|
|
|
+ # if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
|
|
|
+ # one_in_attach = True
|
|
|
+ # if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
|
|
|
+ # one_in_attach = True
|
|
|
+ # if one_in_attach:
|
|
|
+ # if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
+ # return 1
|
|
|
|
|
|
#同一个站源,都有附件但附件没有重叠则不去重
|
|
|
if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
|