|
@@ -999,17 +999,68 @@ def check_codes(project_codes_less,project_codes_greater):
|
|
|
def check_demand():
    """Return ``True`` unconditionally.

    NOTE(review): this looks like a placeholder for a demand check that has
    not been implemented; confirm with callers before relying on it.
    """
    return True
|
|
|
|
|
|
def edit_distance_with_diff(s1, s2):
    """Compute the edit distance between two sequences plus the edit script.

    Both arguments only need ``len()``, iteration and integer indexing with
    items comparable via ``==``, so strings and lists of tokens both work.

    Returns a tuple ``(distance, operations)`` where ``operations`` is listed
    from the start of the sequences to the end and each entry is one of:

    * ``("删除", item)``          - ``item`` of ``s1`` is deleted
    * ``("插入", item)``          - ``item`` of ``s2`` is inserted
    * ``("替换", old, new)``      - ``old`` of ``s1`` is replaced by ``new`` of ``s2``
    """
    rows, cols = len(s1), len(s2)

    # table[r][c] is the edit distance between s1[:r] and s2[:c].
    # Row 0 and column 0 are the base cases (insert / delete everything).
    table = [list(range(cols + 1))]
    for r in range(1, rows + 1):
        table.append([r] + [0] * cols)

    # Fill the table row by row.
    for r, left in enumerate(s1, start=1):
        above = table[r - 1]
        current = table[r]
        for c, right in enumerate(s2, start=1):
            if left == right:
                current[c] = above[c - 1]
            else:
                current[c] = 1 + min(above[c], current[c - 1], above[c - 1])

    # Walk back from the bottom-right corner to recover the operations.
    # When several moves are optimal the order of preference is:
    # delete, then insert, then replace.
    ops = []
    r, c = rows, cols
    while r > 0 and c > 0:
        left, right = s1[r - 1], s2[c - 1]
        cost = table[r][c]
        if left == right:
            r -= 1
            c -= 1
        elif cost == table[r - 1][c] + 1:
            ops.append(("删除", left))
            r -= 1
        elif cost == table[r][c - 1] + 1:
            ops.append(("插入", right))
            c -= 1
        else:
            ops.append(("替换", left, right))
            r -= 1
            c -= 1

    # Whatever is left of either sequence is a pure delete / insert run.
    ops.extend(("删除", s1[k]) for k in range(r - 1, -1, -1))
    ops.extend(("插入", s2[k]) for k in range(c - 1, -1, -1))

    # Operations were collected back-to-front, so flip them.
    ops.reverse()
    return table[rows][cols], ops
|
|
|
+
|
|
|
# Module-level regular expressions used by the title/code checks.
# All patterns are written as raw strings so that regex escapes such as
# ``\d`` or ``\.`` are not interpreted as (invalid) string escapes.

# Package / lot / section identifiers; the whole identifier is captured in
# the named group "name".
# NOTE (from the original comment): removing the "?" after "第" was noted as
# a fix for titles such as "纯木浆8包/箱复印" being read as a package number.
package_number_pattern = re.compile(r"(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")
# Runs of letters, digits, hyphens, dots and brackets.
code_pattern = re.compile(r"[A-Za-z0-9\-\(\)()【】\.-]+")
# A whole string that is an integer or a decimal number.
num_pattern = re.compile(r"^\d+(?:\.\d+)?$")
# Runs of Chinese numerals and ASCII letters.
num1_pattern = re.compile(r"[一二三四五六七八九十A-Za-z]+")
# Runs of Chinese numerals, ASCII letters, digits and hyphens.
num2_pattern = re.compile(r"[一二三四五六七八九十A-Za-z\d-]+")
# Same runs as num2_pattern, otherwise any single character (except a
# newline); used to split a title into tokens.
num3_pattern = re.compile(r"[一二三四五六七八九十A-Za-z\d-]+|.")
# One or two non-bracket characters followed by an administrative suffix
# (city, district, town, county, village, road).
location_pattern = re.compile(r"[^\[【\(]{1,2}[市区镇县村路]")
# Keywords describing the kind / stage of work, plus "Nth time / Nth batch".
building_pattern = r"工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九十1-9]+[)\)]?[次批]"
# Key content of a title that is enclosed in 【】 brackets.
brackets_pattern = r"【([^【】]+?)】"  # |{([^{}]+?)}
# Keywords indicating a repeated / re-issued tender.
rebid_pattern = r"再次|重新招标|[一二三四五六七八九十]+次"
# Dates such as 2023-01-15, 2023.1.15, 2023/01/15 or 2023年1月15.
date_pattern = re.compile(r"\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
|
|
|
-def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
|
|
|
+def check_doctitle(doctitle_refind_less, doctitle_refind_greater,docchannel_less,docchannel_greater, codes_less=[], code_greater=[],page_time_less="",page_time_greater=""):
|
|
|
+ # print('doctitle',doctitle_refind_less,doctitle_refind_greater)
|
|
|
if code_greater is None:
|
|
|
code_greater = []
|
|
|
doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
|
|
@@ -1026,6 +1077,8 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
doctitle_refind_less = ""
|
|
|
if doctitle_refind_greater is None:
|
|
|
doctitle_refind_greater = ""
|
|
|
+ if doctitle_refind_less==doctitle_refind_greater:
|
|
|
+ return True
|
|
|
_pack1 = None
|
|
|
_pack2 = None
|
|
|
#if contain then pass
|
|
@@ -1078,6 +1131,26 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
if len(set_num_l)==len(set_num_g):
|
|
|
if len(set_num_l&set_num_g)!=len(set_num_l):
|
|
|
return False
|
|
|
+ # 产权拍卖类公告,例:小区6号楼2单元1302号
|
|
|
+ if docchannel_less==docchannel_greater and docchannel_less in [115,116,117]:
|
|
|
+ for _p in [num2_pattern]:
|
|
|
+ num_all_l = re.findall(_p, doctitle_refind_less)
|
|
|
+ num_all_g = re.findall(_p, doctitle_refind_greater)
|
|
|
+ set_num_l = set(num_all_l)
|
|
|
+ set_num_g = set(num_all_g)
|
|
|
+ if len(set_num_l) == len(set_num_g):
|
|
|
+ if len(set_num_l & set_num_g) != len(set_num_l):
|
|
|
+ return False
|
|
|
+ # 相似标题对比,编辑距离中替换字段前后都为"数字字母字符串"则判断为不同
|
|
|
+ if getSimilarityOfString(doctitle_refind_less,doctitle_refind_greater) > 0.7:
|
|
|
+ doctitle_refind_less_re = re.findall(num3_pattern,doctitle_refind_less)
|
|
|
+ doctitle_refind_greater_re = re.findall(num3_pattern,doctitle_refind_greater)
|
|
|
+ distance, differences = edit_distance_with_diff(doctitle_refind_less_re, doctitle_refind_greater_re)
|
|
|
+ for diff in differences:
|
|
|
+ if diff[0]=='替换':
|
|
|
+ if re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[1]) and re.search("^[一二三四五六七八九十A-Za-z\d-]+$",diff[2]):
|
|
|
+ # print("标题编辑距离中替换字段前后 数字字母字符串不同")
|
|
|
+ return False
|
|
|
# 重新(多次)招标关键词
|
|
|
for _p in [rebid_pattern]:
|
|
|
num_all_l = re.findall(_p,doctitle_refind_less)
|
|
@@ -1087,7 +1160,8 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
if len(set_num_l)==len(set_num_g):
|
|
|
if len(set_num_l&set_num_g)!=len(set_num_l):
|
|
|
return False
|
|
|
- elif (len(set_num_l) and not len(set_num_g)) or (len(set_num_g) and not len(set_num_l)):
|
|
|
+ # if page_time_less and page_time_less != page_time_greater:
|
|
|
+ if (len(set_num_l) and not len(set_num_g)) or (len(set_num_g) and not len(set_num_l)):
|
|
|
return False
|
|
|
|
|
|
#check the location has conflict
|
|
@@ -1206,23 +1280,30 @@ def check_package(package_less,package_greater,split_char=","):
|
|
|
def check_time(json_time_less,json_time_greater):
|
|
|
has_same = False
|
|
|
has_diff = False
|
|
|
+ time_count_less = 0
|
|
|
+ time_count_greater = 0
|
|
|
if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
|
|
|
if isinstance(json_time_less,dict):
|
|
|
time_less = json_time_less
|
|
|
else:
|
|
|
time_less = json.loads(json_time_less)
|
|
|
+ time_count_less += sum([1 for k,v in time_less.items() if v])
|
|
|
if isinstance(json_time_greater,dict):
|
|
|
time_greater = json_time_greater
|
|
|
else:
|
|
|
time_greater = json.loads(json_time_greater)
|
|
|
+ time_count_greater += sum([1 for k, v in time_greater.items() if v])
|
|
|
for k,v in time_less.items():
|
|
|
if getLength(v)>0:
|
|
|
v1 = time_greater.get(k,"")
|
|
|
if getLength(v1)>0:
|
|
|
if v[:10]!=v1[:10]:
|
|
|
+ # print('time diff',k,v,v1)
|
|
|
has_diff = True
|
|
|
else:
|
|
|
has_same = True
|
|
|
+ if time_count_less==0 and time_count_greater==0:
|
|
|
+ return 2
|
|
|
if has_same:
|
|
|
if has_diff:
|
|
|
return 1
|
|
@@ -1299,6 +1380,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
is_special_bonds_less = document_less.get("is_special_bonds")
|
|
|
products_less = document_less.get("products")
|
|
|
products_original_less = document_less.get("products_original",[])
|
|
|
+ change_content_less = document_less.get("change_content","")
|
|
|
+ change_time_less = document_less.get("change_time","")
|
|
|
|
|
|
|
|
|
docid_greater = document_greater["docid"]
|
|
@@ -1327,6 +1410,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
is_special_bonds_greater = document_greater.get("is_special_bonds")
|
|
|
products_greater = document_greater.get("products")
|
|
|
products_original_greater = document_greater.get("products_original", [])
|
|
|
+ change_content_greater = document_greater.get("change_content", "")
|
|
|
+ change_time_greater = document_greater.get("change_time", "")
|
|
|
|
|
|
moneys_greater = document_greater.get("moneys")
|
|
|
moneys_attachment_greater = document_greater.get("moneys_attachment")
|
|
@@ -1394,6 +1479,30 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
# print("check_products error")
|
|
|
return 0
|
|
|
|
|
|
+ # 变更答疑公告 变更内容对比
|
|
|
+ if docchannel_less in [51,103] and docchannel_less==docchannel_greater:
|
|
|
+ if getLength(change_time_less)>0 and getLength(change_time_greater)>0:
|
|
|
+ if change_time_less != change_time_greater:
|
|
|
+ # print("change_time diff")
|
|
|
+ return 0
|
|
|
+ if getLength(change_content_less) > 10 and getLength(change_content_greater) > 10:
|
|
|
+ _change_content_less = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_less)
|
|
|
+ _change_content_less = "".join(_change_content_less)
|
|
|
+ _change_content_greater = re.findall("[\u4e00-\u9fa5a-zA-Z0-9]+", change_content_greater)
|
|
|
+ _change_content_greater = "".join(_change_content_greater)
|
|
|
+ if _change_content_less == _change_content_greater:
|
|
|
+ # print("change_content same 1")
|
|
|
+ return 1
|
|
|
+ elif _change_content_less.find(_change_content_greater)>=0 or _change_content_greater.find(_change_content_less)>=0:
|
|
|
+ # print("change_content same 2")
|
|
|
+ return 1
|
|
|
+ # elif getSimilarityOfString(_change_content_less,_change_content_greater)>0.8:
|
|
|
+ # print("change_content same 3")
|
|
|
+ # print(_change_content_less)
|
|
|
+ # print(_change_content_greater)
|
|
|
+ # print(getSimilarityOfString(_change_content_less,_change_content_greater))
|
|
|
+ # return 1
|
|
|
+
|
|
|
#一篇要素都在附件,且两篇附件md5有重叠
|
|
|
set_md5_less = set()
|
|
|
set_md5_greater = set()
|
|
@@ -1572,7 +1681,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
check_result["pass"] = 0
|
|
|
else:
|
|
|
check_result["docchannel"] = 2
|
|
|
- if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
|
|
|
+ if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,docchannel_less,docchannel_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
|
|
|
check_result["doctitle"] = 0
|
|
|
check_result["pass"] = 0
|
|
|
if b_log:
|
|
@@ -1654,7 +1763,9 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
|
|
|
|
|
|
#added check
|
|
|
_time_check = check_time(json_time_less,json_time_greater)
|
|
|
- if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
|
|
|
+ # if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
|
|
|
+ if not _time_check or (_time_check==1 and docchannel_less in (51,103) and
|
|
|
+ len([k for k,v in json_time_less.items() if v])>0 and len([k for k,v in json_time_greater.items() if v])>0):
|
|
|
if b_log:
|
|
|
logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
|
|
|
if isinstance(json_time_less,dict):
|