|
@@ -661,7 +661,104 @@ def load(path):
|
|
|
with open(path, 'rb') as f:
|
|
|
object1 = pickle.load(f)
|
|
|
return object1
|
|
|
-
|
|
|
+
|
|
|
+
|
|
|
def uniform_num(num):
    """Convert a short numeral string to Arabic digits.

    Three notations are recognized:

    * digit strings: returned as-is, except that a two-digit value with a
      leading zero (``'05'``) loses the zero;
    * Chinese numerals from 一 to 九十九 (``'十二'`` -> ``'12'``);
    * the Roman numeral characters Ⅰ..Ⅶ: the first such character found
      anywhere in the string is converted (``'Ⅳ'`` -> ``'4'``).

    :param num: numeral text to normalize (str).
    :return: the Arabic-digit string, or ``num`` unchanged when it matches
        none of the notations or is not a well-formed numeral.
    """
    chinese_digits = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5',
                      '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
    roman_digits = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}

    if num.isdigit():
        # Only the exact two-digit form '0N' is shortened; longer
        # zero-padded values such as '007' are kept verbatim.
        if re.search(r'^0\d$', num):
            num = num[1:]
        return num

    chinese = re.search(r'^[一二三四五六七八九十]+$', num)
    if chinese:
        text = chinese.group(0)
        if len(text) == 1:
            return chinese_digits[text]
        # Split around the first 十 into "<tens>十<ones>"; either side may
        # be empty ('十二' -> 12, '二十' -> 20).
        tens, ten_mark, ones = text.partition('十')
        well_formed = (bool(ten_mark) and len(tens) <= 1
                       and len(ones) <= 1 and ones != '十')
        if not well_formed:
            # e.g. '一二' or '十十': not a valid numeral, leave it untouched
            # instead of concatenating unrelated digits.
            return num
        tens_digit = chinese_digits[tens] if tens else '1'
        ones_digit = chinese_digits[ones] if ones else '0'
        return tens_digit + ones_digit

    roman = re.search(r'[ⅠⅡⅢⅣⅤⅥⅦ]', num)
    if roman:
        return roman_digits[roman.group(0)]
    return num
|
|
|
+
|
|
|
def _package_whole_upper(match):
    """Return the entire matched text upper-cased."""
    return match.group(0).upper()


def _package_prefix_number(match):
    """Return the optional ``eng`` prefix upper-cased plus the normalized ``num``."""
    prefix = match.group('eng')
    number = uniform_num(match.group('num'))
    return (prefix.upper() if prefix else '') + number


def _package_letters(match):
    """Return the ``eng`` group upper-cased."""
    return match.group('eng').upper()


def _package_number(match):
    """Return the whole match normalized to Arabic digits."""
    return uniform_num(match.group(0))


# File-extension fragments and "NNNN年" year markers are blanked out before
# matching. ``doc[sx]?`` consumes the whole of 'docs'/'docx' so that no stray
# 's'/'x' is left behind to be mistaken for a package letter.
_PACKAGE_NOISE = re.compile(r'pdf|doc[sx]?|xlsx|rar|\d{4}年')
_PACKAGE_KEYWORD = re.compile(r'(施工|监理|监测|勘察|设计|劳务)')

# Ordered (pattern, formatter) rules; the first pattern that matches wins.
# NOTE(review): the '[::]' classes contain the same colon twice; one of them
# may originally have been a full-width colon - confirm against the upstream
# file before changing it.
_PACKAGE_RULES = (
    # A bare code of five or more letters/digits/hyphens.
    (re.compile(r'^[a-zA-Z0-9-]{5,}$'), _package_whole_upper),
    # "<letter>包<number>标段", e.g. A包2标段.
    (re.compile(r'(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?'),
     _package_prefix_number),
    # "[prefix]<number>" followed by a 标/合同/包 style suffix.
    (re.compile(r'第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))'),
     _package_prefix_number),
    # A 标/项目/包 style label followed by "[prefix]<number>".
    (re.compile(r'(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))'),
     _package_prefix_number),
    # A 标/包 style label followed by letters only.
    (re.compile(r'(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})'),
     _package_letters),
    # Letters followed by a 标/包 style suffix.
    (re.compile(r'(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))'),
     _package_letters),
    # The whole string is a single numeral.
    (re.compile(r'^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$'),
     _package_number),
    # Any other purely alphanumeric/hyphen code.
    (re.compile(r'^[a-zA-Z0-9-]+$'), _package_whole_upper),
)


def uniform_package_name(package_name):
    """Normalize a bid package / lot identifier.

    Numerals (Arabic, Chinese, Roman) are unified to Arabic digits via
    ``uniform_num``, letters are upper-cased, and the first of the keywords
    施工/监理/监测/勘察/设计/劳务 found in the text is placed in front of
    the code. Examples: ``'A包1标段'`` -> ``'A1'``, ``'监理A包1标段'`` ->
    ``'监理A1'``, ``'包Ⅱ'`` -> ``'2'``.

    :param package_name: raw package identifier (str).
    :return: the normalized identifier. When neither a keyword nor any rule
        matches, the original ``package_name`` is returned unchanged. A
        result made only of digits has its leading zeros removed.
    """
    cleaned = _PACKAGE_NOISE.sub(' ', package_name)
    cleaned = cleaned.replace('标段(包)', '标段').replace('№', '')
    cleaned = re.sub(r'\[|【', '', cleaned)

    keyword = _PACKAGE_KEYWORD.search(cleaned)
    name = keyword.group(0) if keyword else ""

    for pattern, formatter in _PACKAGE_RULES:
        match = pattern.search(cleaned)
        if match:
            name += formatter(match)
            break

    if not name:
        return package_name_raw_fallback(package_name)
    if name.isdigit():
        # Collapse zero padding such as '00123' -> '123'.
        name = str(int(name))
    return name


def package_name_raw_fallback(package_name):
    """Return the untouched input; used when nothing could be normalized."""
    return package_name
|
|
|
|
|
|
def getIndexOfWord_fool(word):
|
|
|
|