|
@@ -556,7 +556,7 @@ def getPackageScopePattern():
|
|
|
return pattern
|
|
|
|
|
|
pattern_packageScope = getPackageScopePattern()
|
|
|
-def getPackagesFromArticle(list_sentence,list_entity):
|
|
|
+def getPackagesFromArticle_backup(list_sentence,list_entity):
|
|
|
'''
|
|
|
@param:
|
|
|
list_sentence:文章的句子list
|
|
@@ -784,6 +784,291 @@ def getPackagesFromArticle(list_sentence,list_entity):
|
|
|
PackageList.append(copy_pack)
|
|
|
return PackageList,PackageSet,dict_packageCode
|
|
|
|
|
|
def getPackagesFromArticle(list_sentence, list_entity):
    '''
    @summary: Find package / bid-section mentions in the sentences of an
        article, append one "package" Entity per accepted mention to
        list_entity and compute the scope every accepted package covers.
    @param:
        list_sentence: sentence objects of the article. The code reads their
            sentence_index, sentence_text, tokens, doc_id and in_attachment
            attributes. The list is sorted in place by sentence_index.
        list_entity: entity list of the article. Package entities are
            appended to it; its org/company entities with label == 2 are
            counted by the fallback rule.
    @return: None when list_sentence is empty, otherwise the tuple
        (PackageList, PackageSet, dict_packageCode) where
        PackageList is a list of dicts with the keys name, sentence_index,
            offsetWords_begin, offsetWord_begin, offsetWord_end, scope
            ([begin, end], each a [sentence_index, token offset] pair),
            hit (an empty set) and pointer (the appended Entity);
        PackageSet is the set of package names that were found;
        dict_packageCode maps a package name to the closest code captured by
            package_code_pattern around the mention.
    '''

    if len(list_sentence) == 0:
        return None
    list_sentence.sort(key=lambda x: x.sentence_index)

    PackageList = []
    PackageSet = set()
    dict_packageCode = dict()

    # NOTE(review): several character classes below list the same punctuation
    # mark twice (e.g. "[::]"); presumably one of each pair was a full-width
    # variant that got lost in this copy -- confirm against the upstream file.
    #
    # Package / bid-section number. The pattern is split at its top-level
    # alternatives for readability; the pieces concatenate to one regex.
    package_number_pattern = re.compile(
        r'((施工|监理|监测|勘察|设计)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))'
        r'|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标段?)'
        # The opening-parenthesis alternative must be a literal; left bare it
        # unbalances the groups and the pattern cannot be compiled.
        r'|((标[段号的包项]|([标分子]|合同|项目|采购|[((])包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})'
        r'|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)'
        r'|([a-zA-Z][0-9]{,3}分?[包标])'
        r'|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)'
        r'|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')
    # 2020/11/23 large-website rules: "name: value" style item descriptions.
    other_package_pattern = re.compile(
        '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')
    win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')
    model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')

    package_code_pattern = re.compile(r"(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
    # Purely numeric package names are unified, e.g. '01' and '1'.
    re_digital = re.compile(r"^\d+$")

    def changeIndexFromWordToWords(tokens, word_index):
        '''
        @summary: Convert a character offset into the index of the token that
            contains it. A boundary offset resolves to the token ending there.
        @return: the token index, or None when no token covers the offset.
        '''
        before_index = 0
        for i, token in enumerate(tokens):
            after_index = before_index + len(token)
            if before_index <= word_index <= after_index:
                return i
            before_index = after_index
        return None

    def extractPackageCode(tokens, word_index, size=20, pattern=package_code_pattern):
        '''
        @summary: Extract the code that is closest to a package mention.
        @param:
            tokens: tokens of the sentence that contains the package
            word_index: character offset of the package in the sentence
            size: number of tokens taken on each side of the package
            pattern: regex whose group 1 captures the code
        @return: the closest code as a string, or None when nothing matches
        '''
        index = changeIndexFromWordToWords(tokens, word_index)
        if index is None:
            # The offset is not covered by the tokens: nothing to look at.
            return None
        # Clamp the window to the sentence; a mention near the sentence start
        # keeps whatever left context exists instead of losing all of it.
        begin = max(0, index - size)
        end = min(len(tokens), index + size)
        text = "".join(tokens[begin:end])
        # Offset of the package inside the window text.
        new_word_index = word_index - len("".join(tokens[:begin]))
        min_distance = len(text)
        packageCode = None
        for code_match in re.finditer(pattern, text):
            distance = min(abs(new_word_index - code_match.start()),
                           abs(new_word_index - code_match.end()))
            if distance < min_distance:
                min_distance = distance
                packageCode = code_match.group(1)
        return packageCode

    def uniform_num(num):
        '''
        @summary: Normalise a number written with Arabic digits, Chinese
            numerals or Roman-numeral characters to an Arabic digit string.
            Input that fits none of the handled shapes is returned unchanged.
        '''
        d1 = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
        d3 = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}
        if num.isdigit():
            # '01' -> '1'
            if re.search(r'^0[\d]$', num):
                num = num[1:]
            return num
        chinese = re.search('^[一二三四五六七八九十]+$', num)
        if chinese:
            _digit = chinese.group(0)
            if len(_digit) == 1:
                num = d1[_digit]
            elif len(_digit) == 2 and _digit[0] == '十':
                # ten + unit, e.g. 十二 -> 12
                num = '1' + d1[_digit[1]]
            elif len(_digit) == 2 and _digit[1] == '十':
                # round tens, e.g. 二十 -> 20
                num = d1[_digit[0]] + '0'
            elif len(_digit) == 3 and _digit[1] == '十':
                # tens + unit, e.g. 二十一 -> 21
                num = d1[_digit[0]] + d1[_digit[2]]
            return num
        roman = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num)
        if roman:
            num = d3[roman.group(0)]
        return num

    def uniform_package_name(package_name):
        '''
        @summary: Reduce a matched package mention to a canonical name: an
            optional work-type keyword followed by the upper-cased code or
            the normalised number. Falls back to the (extension-stripped)
            mention when nothing can be extracted.
        '''
        # NOTE(review): 'docs' can never match because 'doc' is tried first.
        package_name = re.sub('pdf|doc|docs|xlsx', '', package_name)
        kw = re.search('(施工|监理|监测|勘察|设计)', package_name)
        name = ""
        if kw:
            name += kw.group(0)
        if re.search('([a-zA-Z]包[:)]?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}标段?)', package_name):
            # e.g. "A包2标段": letter plus number
            _char = re.search('[a-zA-Z]', package_name).group(0)
            _digit = re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name).group(0)
            name += _char + uniform_num(_digit)
        elif re.search('[a-zA-Z0-9-]{5,}', package_name):
            # codes of five or more characters
            name += re.search('[a-zA-Z0-9-]{5,}', package_name).group(0).upper()
        elif re.search('[a-zA-Z]{1,4}[0-9]{,3}', package_name):
            # Latin letters are unified to upper case
            name += re.search('[a-zA-Z]{1,4}[0-9]{,3}', package_name).group(0).upper()
        elif re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name):
            # numbers are unified to Arabic digits
            _digit = re.search('[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}', package_name).group(0)
            name += uniform_num(_digit)
        if name == "":
            return package_name
        return name

    def make_item(name, sentence, found):
        '''Build the position record of one match inside a sentence.'''
        return {"name": name, "sentence_index": sentence.sentence_index,
                "offsetWords_begin": changeIndexFromWordToWords(sentence.tokens, found.start()),
                "offsetWord_begin": found.start(), "offsetWord_end": found.end()}

    def register_package(name, sentence, found, sentence_items):
        '''Record a named package and the code found next to it, if any.'''
        sentence_items.append(make_item(name, sentence, found))
        code = extractPackageCode(sentence.tokens, found.start())
        if code is not None:
            dict_packageCode[name] = code
        PackageSet.add(name)

    def merge_with_scope_items(sentence, sentence_items):
        '''Add the unnamed packageScope matches of the sentence to its named
        items and order everything by character offset.'''
        scope_items = [make_item("", sentence, found)
                       for found in re.finditer(pattern_packageScope, sentence.sentence_text)]
        merged = sentence_items + scope_items
        merged.sort(key=lambda x: x["offsetWord_begin"])
        return merged

    def get_package():
        '''Collect package-number mentions and packageScope matches of every
        sentence, in document order.'''
        PackageList_scope = []
        for sentence in list_sentence:
            content = sentence.sentence_text
            sentence_items = []
            for found in re.finditer(package_number_pattern, content):
                matched = found.group(0)
                # Exclude cases like "2.10标段3": a match that starts with a
                # digit and is directly followed by another digit.
                if re.match(r'\d', matched) and found.end() < len(content) and content[found.end()].isdigit():
                    continue
                if re.search('承包|XX|xx', matched) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', matched):
                    continue
                register_package(uniform_package_name(matched), sentence, found, sentence_items)
            PackageList_scope = PackageList_scope + merge_with_scope_items(sentence, sentence_items)
        return PackageList_scope

    def get_win_project():
        '''Fallback for articles with several items and several winners:
        use "name: value" item descriptions as package names.'''
        PackageList_scope = []
        # 2020/11/23 large-website rules: only applies when no package was
        # found and more than one distinct org/company entity has label == 2.
        if len(PackageSet) != 0:
            return PackageList_scope
        winners = {it.entity_text for it in list_entity
                   if it.entity_type in ['org', 'company'] and it.label == 2}
        if len(winners) <= 1:
            return PackageList_scope
        for sentence in list_sentence:
            content = sentence.sentence_text
            names = re.findall(other_package_pattern, content)
            N_names = re.findall(win_tenderer_pattern, content)
            # Only sentences with exactly one item and one winner are used.
            if len(names) != 1 or len(N_names) != 1:
                continue
            sentence_items = []
            for found in re.finditer(other_package_pattern, content):
                temp_package_number = found.group(4)
                model = re.search(model_pattern, content)
                if model:
                    temp_package_number = temp_package_number + '+' + model.group(2)
                if re.search(re_digital, temp_package_number):
                    temp_package_number = str(int(temp_package_number))
                register_package(temp_package_number, sentence, found, sentence_items)
            PackageList_scope = PackageList_scope + merge_with_scope_items(sentence, sentence_items)
        return PackageList_scope

    def get_package_scope(PackageList_scope):
        '''Turn the ordered position records into package entities with a
        [begin, end] scope; appends the entities to list_entity.'''
        PackageList = []
        pattern_punctuation = r"[::()\(\),,。;;]"
        colons = [":", ":"]
        last_sentence = list_sentence[-1]
        last_position = len(PackageList_scope) - 1
        for i, sentence in enumerate(list_sentence):
            text = sentence.sentence_text
            for j, item in enumerate(PackageList_scope):
                if i != item["sentence_index"] or item["name"] == "":
                    continue
                begin = item["offsetWord_begin"]
                # Clamp the left window: a negative slice start would count
                # from the end of the sentence instead of from its start.
                left_str = text[max(0, begin - 30):begin + 1]
                right_str = text[begin:begin + 30]
                _left_find = re.findall(pattern_punctuation, left_str)
                _right_find = re.findall(pattern_punctuation, right_str)
                # NOTE(review): names produced by uniform_package_name have
                # Chinese numerals converted to digits, so the comparison
                # with "一" presumably never holds for them -- confirm.
                if re.search("同", left_str[-1:]) is not None and item["name"] == "一":
                    continue
                if re.search("划分", right_str[:10]) is not None:
                    continue
                # A colon as the nearest punctuation on either side means the
                # scope starts at the package itself.
                _flag = False
                if len(_left_find) > 0 and _left_find[-1] in colons:
                    _flag = True
                if len(_right_find) > 0 and _right_find[0] in colons:
                    _flag = True
                if _flag:
                    scope_begin = [item["sentence_index"], item["offsetWords_begin"]]
                elif j == 0:
                    scope_begin = [0, 0]
                else:
                    scope_begin = [PackageList_scope[j - 1]["sentence_index"],
                                   PackageList_scope[j - 1]["offsetWords_begin"]]
                if j == last_position:
                    scope_end = [last_sentence.sentence_index,
                                 changeIndexFromWordToWords(last_sentence.tokens,
                                                            len(last_sentence.sentence_text))]
                else:
                    scope_end = [PackageList_scope[j + 1]["sentence_index"],
                                 PackageList_scope[j + 1]["offsetWords_begin"]]
                # Skip a mention that lies inside the span of the previous one.
                # NOTE(review): for j == 0 the index -1 wraps to the last item,
                # so a list with a single item compares the item with itself
                # and always skips it. Behaviour kept as in the original --
                # confirm whether this is intended.
                previous = PackageList_scope[j - 1]
                if previous["sentence_index"] == item["sentence_index"] and \
                        previous["offsetWord_begin"] <= begin and \
                        previous["offsetWord_end"] >= item["offsetWord_end"]:
                    continue

                # Add the package to the entity list. The begin offset fills
                # both positional fields of the id, as in the original.
                _pack_entity = Entity(doc_id=list_sentence[0].doc_id,
                                      entity_id="%s_%s_%s_%s" % (
                                          list_sentence[0].doc_id, i, begin, begin),
                                      entity_text=item["name"],
                                      entity_type="package",
                                      sentence_index=item["sentence_index"],
                                      begin_index=changeIndexFromWordToWords(sentence.tokens, begin),
                                      end_index=changeIndexFromWordToWords(sentence.tokens,
                                                                           item["offsetWord_end"]),
                                      wordOffset_begin=begin,
                                      wordOffset_end=item["offsetWord_end"],
                                      in_attachment=sentence.in_attachment)
                list_entity.append(_pack_entity)
                copy_pack = copy.copy(item)
                copy_pack["scope"] = [scope_begin, scope_end]
                copy_pack["hit"] = set()
                copy_pack["pointer"] = _pack_entity
                PackageList.append(copy_pack)
        return PackageList

    PackageList_scope = get_package()
    if len(PackageList_scope) > 0:  # package / bid-section mentions found
        PackageList = get_package_scope(PackageList_scope)
    else:
        PackageList_scope = get_win_project()
        if len(PackageList_scope) > 1:  # several items together with several winners
            PackageList = get_package_scope(PackageList_scope)

    return PackageList, PackageSet, dict_packageCode
|
|
|
+
|
|
|
+
|
|
|
# km配对方法
|
|
|
def dispatch(match_list):
|
|
|
main_roles = list(set([match.main_role for match in match_list]))
|
|
@@ -3047,7 +3332,8 @@ def limit_maximum_amount(prem, industry):
|
|
|
'工程评价服务': 100000000,
|
|
|
'其他工程服务': 100000000,
|
|
|
'工程监理服务': 100000000,
|
|
|
- '工程造价服务': 100000000
|
|
|
+ '工程造价服务': 100000000,
|
|
|
+ '会计、审计及税务服务': 100000000,
|
|
|
}
|
|
|
if indu in indu_amount:
|
|
|
maximum_amount = indu_amount[indu]
|