|
@@ -2201,6 +2201,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed) # 2022/8/10统一表达
|
|
|
article_processed = article_processed.replace('\(%)', '') # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网 金额特殊问题
|
|
|
article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed) # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元 金额特殊问题
|
|
|
+ article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed) # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
|
|
|
+ article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed) # 18889217, 84422177
|
|
|
ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
|
|
|
if ser:
|
|
|
article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
|
|
@@ -2423,6 +2425,189 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
|
|
|
article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
|
|
|
return list_sentences,list_outlines
|
|
|
|
|
|
+def get_money_entity(sentence_text, found_yeji):  # Scan sentence_text for money amounts with regexes; returns (money_list, found_yeji)
|
|
|
+ money_list = []  # collects (entity_text, start_index, end_index, unit, notes) tuples
|
|
|
+ # Recognize money amounts with regular expressions
|
|
|
+ entity_type = "money"  # NOTE(review): not read again before the return of this function
|
|
|
+ list_money_pattern = {"cn": "(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",  # "cn": a run of 3+ Chinese numeral/unit characters
|
|
|
+ "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*(,?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",  # "key_word": number introduced by a price/amount keyword, with optional unit groups before and behind it
|
|
|
+ "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",  # "front_m": the unit group comes before the number
|
|
|
+ "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}  # "behind_m": the number is followed by its unit group
|
|
|
+ # 2021/7/19: adjusted the amount/unit extraction regexes to fix amounts being filtered out because unit extraction failed.
|
|
|
+
|
|
|
+ pattern_money = re.compile("%s|%s|%s|%s" % (  # alternation order: cn, key_word, behind_m, front_m
|
|
|
+ list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
|
|
|
+ list_money_pattern["front_m"]))
|
|
|
+
|
|
|
+ if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):  # the sentence mentions 业绩 in one of these forms
|
|
|
+ found_yeji += 1
|
|
|
+ if found_yeji >= 2: # filter out all amounts that come after 业绩 (the counter has reached 2)
|
|
|
+ all_match = []
|
|
|
+ else:
|
|
|
+ ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text) # filter out amounts inside fee-standard / calculation-formula text
|
|
|
+ if ser:
|
|
|
+ all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))  # blank the formula with same-length spaces so match offsets still line up with sentence_text
|
|
|
+ else:
|
|
|
+ all_match = re.finditer(pattern_money, sentence_text)
|
|
|
+ for _match in all_match:
|
|
|
+ # print('_match: ', _match.group())
|
|
|
+ if len(_match.group()) > 0:
|
|
|
+ # print("===",_match.group())
|
|
|
+ # # print(_match.groupdict())
|
|
|
+ notes = '' # 2021/7/20: added a note marking an uppercase-numeral amount or the amount unit: if uppercase amount notes=大写 elif unit notes=单位
|
|
|
+ unit = ""
|
|
|
+ entity_text = ""
|
|
|
+ start_index = ""  # placeholder; reassigned from _match.span() further down
|
|
|
+ end_index = ""  # placeholder; reassigned from _match.span() further down
|
|
|
+ text_beforeMoney = ""
|
|
|
+ filter = ""  # NOTE(review): shadows the builtin filter() inside this function
|
|
|
+ filter_unit = False
|
|
|
+ notSure = False
|
|
|
+ science = ""
|
|
|
+ if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]): # 2021/7/21: filter out amounts that follow 业绩
|
|
|
+ # print('amount comes after 业绩: ', _match.group(0))
|
|
|
+ found_yeji += 1
|
|
|
+ break  # stop scanning the remaining matches
|
|
|
+ for k, v in _match.groupdict().items():  # dispatch each non-empty named group on the prefix of its name
|
|
|
+ if v != "" and v is not None:
|
|
|
+ if k == 'text_key_word':
|
|
|
+ notSure = True
|
|
|
+ if k.split("_")[0] == "money":
|
|
|
+ entity_text = v
|
|
|
+ # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
|
|
|
+ if entity_text.endswith(',00'): # an amount cannot end with a comma followed by two zeros; presumably a misrecognized decimal point, so drop it
|
|
|
+ entity_text = entity_text[:-3]
|
|
|
+ if k.split("_")[0] == "unit":
|
|
|
+ if v == '万元' or unit == "": # handles cases like "预算金额(元):160万元" where the units before and after the amount disagree
|
|
|
+ unit = v
|
|
|
+ if k.split("_")[0] == "text":
|
|
|
+ # print('text_before: ', _match.group(k))
|
|
|
+ text_beforeMoney = v
|
|
|
+ if k.split("_")[0] == "filter":  # true for every group named filter*, i.e. filter_ and filter_unit1/2/3
|
|
|
+ filter = v
|
|
|
+ if re.search("filter_unit", k) is not None:
|
|
|
+ filter_unit = True
|
|
|
+ if k.split("_")[0] == 'science':
|
|
|
+ science = v
|
|
|
+ # print("amount:{0} ,unit:{1}, preceding text:{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
|
|
|
+ # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()): # 2021/7/19: fix a decimal point that OCR recognized as a comma
|
|
|
+ # if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
|
|
|
+ # entity_text = re.sub('\d+,', '', entity_text)
|
|
|
+ # else:
|
|
|
+ # entity_text = entity_text.replace(',', '.')
|
|
|
+ # # print(' fix a decimal point that OCR recognized as a comma')
|
|
|
+
|
|
|
+ if filter != "":  # a filter* group captured something, so this match is skipped
|
|
|
+ continue
|
|
|
+ start_index, end_index = _match.span()
|
|
|
+ start_index += len(text_beforeMoney)  # advance the start by the length of the captured text_* prefix
|
|
|
+ if unit == "": # 2021/7/21: supply a unit for amounts with obvious money features so they are not filtered out
|
|
|
+ if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
|
|
|
+ unit = '元'
|
|
|
+ # print('1: obvious money feature, unit filled in as 元')
|
|
|
+ elif re.search('USD[::]?$', text_beforeMoney):
|
|
|
+ unit = '美元'
|
|
|
+ elif re.search('EUR[::]?$', text_beforeMoney):
|
|
|
+ unit = '欧元'
|
|
|
+ elif re.search('JPY[::]?$', text_beforeMoney):
|
|
|
+ unit = '日元'
|
|
|
+ elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
|
|
|
+ # print('two amounts joined by a dash and the later one has a unit: use the later unit')
|
|
|
+ unit = '万元'
|
|
|
+ elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
|
|
|
+ if re.search('^[\d,,.]+$', entity_text) and re.sub('[,,.]', '', entity_text).isdigit() and float(re.sub('[,,.]', '', entity_text))<500 and re.search('万元', sentence_text):
|
|
|
+ unit = '万元'
|
|
|
+ # print('small amount and the sentence contains 万元: unit filled in as 万元')
|
|
|
+ else:
|
|
|
+ unit = '元'
|
|
|
+ # print('amount directly preceded by a keyword: unit filled in as 元')
|
|
|
+ elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
|
|
|
+ unit = '元'
|
|
|
+ # print('3: obvious money feature, unit filled in as 元')
|
|
|
+ else:
|
|
|
+ # print('filtered out amount without a unit: ',entity_text)
|
|
|
+ continue
|
|
|
+ elif unit == '万元':
|
|
|
+ if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):  # the character right after the match is 元 and the amount ends in a digit
|
|
|
+ unit = '元'
|
|
|
+ elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # a 万元 amount worth 50 billion yuan or more is switched to 元
|
|
|
+ unit = '元'
|
|
|
+ if unit.find("万") >= 0 and entity_text.find("万") >= 0: # 2021/7/19: when the amount text itself contains 万, the unit is not applied
|
|
|
+ # print('fix: amount and unit both contain 万, amount:',entity_text, 'unit:',unit)
|
|
|
+ unit = "元"
|
|
|
+ if re.search('.*万元万元', entity_text): # 2021/7/19: fix a doubled 万元
|
|
|
+ # print(' fix a doubled 万元',entity_text)
|
|
|
+ entity_text = entity_text.replace('万元万元', '万元')
|
|
|
+ else:
|
|
|
+ if filter_unit:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else '' # keep the minus sign for negative amounts; the following are not treated as negative: 起拍价:105.29-200.46万元 预 算 --- 350000.0 ; 2023/04/14: sign handling removed
|
|
|
+
|
|
|
+ entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)  # keep only digits, "." and Chinese numeral/unit characters
|
|
|
+ # print('amount before conversion:', entity_text, 'unit:', unit, 'notes:',notes, 'text_beforeMoney:',text_beforeMoney)
|
|
|
+ if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]): # 2021/8/5: total-investment amounts (tagged through notes here)
|
|
|
+ # print('total-investment amount: ', _match.group(0))
|
|
|
+ notes = '总投资'
|
|
|
+ elif re.search('投资|概算',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/11/18: investment amounts are not treated as tender amounts
|
|
|
+ notes = '投资'
|
|
|
+ elif re.search('工程造价',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/12/20: 工程造价 (project cost) is not treated as a tender amount
|
|
|
+ notes = '工程造价'
|
|
|
+ elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
|
|
|
+ or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
|
|
|
+ or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
|
|
|
+ notes = '保证金'
|
|
|
+ # print('保证金 (deposit) info:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
|
|
|
+ elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
|
|
|
+ sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
|
|
|
+ notes = '成本警戒线'
|
|
|
+ elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
|
|
|
+ cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
|
|
|
+ notes = cost_re.group(1)  # the note is the matched fee kind: 监理, 设计 or 勘察
|
|
|
+ elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
|
|
|
+ notes = '单价'
|
|
|
+ elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
|
|
|
+ notes = '大写'
|
|
|
+ if entity_text[0] == "拾": # 2021/12/16: fix the conversion error when an uppercase amount omits its leading digit
|
|
|
+ entity_text = "壹" + entity_text
|
|
|
+ # print("note added: notes = 大写")
|
|
|
+ if len(unit) > 0:
|
|
|
+ if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8: # 2021/7/19: fix 万元 amounts that are too large
|
|
|
+ # print('fix: unit is 万元 but the amount is too large, amount:', entity_text, 'unit:', unit)
|
|
|
+ entity_text = str(
|
|
|
+ getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
|
|
|
+ unit = '元' # after correcting the amount, reset the unit to 元
|
|
|
+ else:
|
|
|
+ # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
|
|
|
+ entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
|
|
|
+ else:
|
|
|
+ if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
|
|
|
+ entity_text.split('.')[0]) >= 8:
|
|
|
+ entity_text = str(getUnifyMoney(entity_text) / 10000)
|
|
|
+ # print('fix: amount text contains 万 and is too large')
|
|
|
+ else:
|
|
|
+ entity_text = str(getUnifyMoney(entity_text))
|
|
|
+ if science and re.search('^E-?\d+$', science): # scientific notation
|
|
|
+ entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
|
|
|
+ entity_text + science) < 10000000000 else entity_text # use the scientific-notation value only when it is greater than 100 and less than 10,000,000,000
|
|
|
+
|
|
|
+ if float(entity_text) > 100000000000: # "float(entity_text)<100 or" removed on 2022/3/4: the minimum-amount limit was dropped
|
|
|
+ # print('filtered out amount: float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
|
|
|
+ continue
|
|
|
+
|
|
|
+ if notSure and unit == "" and float(entity_text) > 100 * 10000:
|
|
|
+ # print('filtered out amount: notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
|
|
|
+ continue
|
|
|
+ # print("amount:{0} ,unit:{1}, preceding text:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
|
|
|
+ # filter, filter_unit))
|
|
|
+ money_list.append((entity_text, start_index, end_index, unit, notes))
|
|
|
+ return money_list, found_yeji
|
|
|
+
|
|
|
def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
'''
|
|
|
|
|
@@ -2595,193 +2780,34 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
|
|
|
second2last.is_tail = True
|
|
|
|
|
|
#使用正则识别金额
|
|
|
- entity_type = "money"
|
|
|
- list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
|
|
|
- "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-*[0-9][\d,]*(?:\.\d+)?(?P<science_key_word>(E-?\d+))?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
|
|
|
- "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
|
|
|
- "behind_m":"(()()(?P<money_behind_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
|
|
|
- # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
|
|
|
-
|
|
|
- pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
|
|
|
-
|
|
|
- # if re.search('评标结果|候选人公示', sentence_text):
|
|
|
- # found_pingbiao = True
|
|
|
- if re.search('业绩', sentence_text):
|
|
|
- found_yeji += 1
|
|
|
- if found_yeji >= 2: # 过滤掉业绩后面的所有金额
|
|
|
- all_match = []
|
|
|
- else:
|
|
|
- ser = re.search('(收费标准|计算方式):\w{3,5}=[中标成交金额价格万元()\d%+*.-\[\]]+[,。]', sentence_text) # 过滤掉收费标准里面的金额
|
|
|
- if ser:
|
|
|
- all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' '*len(ser.group(0))))
|
|
|
- else:
|
|
|
- all_match = re.finditer(pattern_money, sentence_text)
|
|
|
- index = 0
|
|
|
- for _match in all_match:
|
|
|
- if len(_match.group())>0:
|
|
|
- # print("===",_match.group())
|
|
|
- # # print(_match.groupdict())
|
|
|
- notes = '' # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
|
|
|
- unit = ""
|
|
|
- entity_text = ""
|
|
|
- text_beforeMoney = ""
|
|
|
- filter = ""
|
|
|
- filter_unit = False
|
|
|
- notSure = False
|
|
|
- science = ""
|
|
|
- if re.search('业绩', sentence_text[:_match.span()[0]]): # 2021/7/21过滤掉业绩后面金额
|
|
|
- # print('金额在业绩后面: ', _match.group(0))
|
|
|
- found_yeji += 1
|
|
|
- break
|
|
|
- if (re.search('电话|编码|编号|号码|日期|时间|账号', sentence_text[max(0, _match.start()-12): _match.end()]) or re.search('^[a-zA-Z0-9+-]', sentence_text[_match.end():])) and re.search('[元¥¥]', _match.group(0)) == None:
|
|
|
- continue
|
|
|
|
|
|
- for k,v in _match.groupdict().items():
|
|
|
- if v!="" and v is not None:
|
|
|
- if k=='text_key_word':
|
|
|
- notSure = True
|
|
|
- if k.split("_")[0]=="money":
|
|
|
- entity_text = v
|
|
|
- if entity_text.endswith(',00'): # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
|
|
|
- entity_text = entity_text[:-3]
|
|
|
- if k.split("_")[0]=="unit":
|
|
|
- if v=='万元' or unit=="": # 处理 预算金额(元):160万元 这种出现前后单位不一致情况
|
|
|
- unit = v
|
|
|
- if k.split("_")[0]=="text":
|
|
|
- text_beforeMoney = v
|
|
|
- if k.split("_")[0]=="filter":
|
|
|
- filter = v
|
|
|
- if re.search("filter_unit",k) is not None:
|
|
|
- filter_unit = True
|
|
|
- if k.split("_")[0] == 'science':
|
|
|
- science = v
|
|
|
- # print(_match.group())
|
|
|
- # print(entity_text,unit,text_beforeMoney,filter,filter_unit)
|
|
|
- if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()): # 2021/7/19 修正OCR识别小数点为逗号
|
|
|
- if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0]-2):_match.span()[0]]):
|
|
|
- entity_text = re.sub('\d+,', '', entity_text)
|
|
|
- else:
|
|
|
- entity_text = entity_text.replace(',', '.')
|
|
|
- # print(' 修正OCR识别小数点为逗号')
|
|
|
-
|
|
|
- if entity_text.find("元")>=0:
|
|
|
- unit = ""
|
|
|
- if unit == "": #2021/7/21 有明显金额特征的补充单位,避免被过滤
|
|
|
- if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
|
|
|
- unit = '元'
|
|
|
- # print('明显金额特征补充单位 元')
|
|
|
- elif re.search('[单报标限]价|金额|价格|(监理|设计|勘察)(服务)?费[::为]+$', text_beforeMoney.strip()) and \
|
|
|
- re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
|
|
|
- unit = '元'
|
|
|
- # print('明显金额特征补充单位 元')
|
|
|
- elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)',entity_text):
|
|
|
- unit = '元'
|
|
|
- # print('明显金额特征补充单位 元')
|
|
|
- if unit.find("万") >= 0 and entity_text.find("万") >= 0: #2021/7/19修改为金额文本有万,不计算单位
|
|
|
- # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
|
|
|
- unit = "元"
|
|
|
- if re.search('.*万元万元', entity_text): #2021/7/19 修正两个万元
|
|
|
- # print(' 修正两个万元',entity_text)
|
|
|
- entity_text = entity_text.replace('万元万元','万元')
|
|
|
- else:
|
|
|
- if filter_unit:
|
|
|
- continue
|
|
|
- if filter!="":
|
|
|
- continue
|
|
|
-
|
|
|
- index = _match.span()[0]+len(text_beforeMoney)
|
|
|
- begin_index_temp = index
|
|
|
- for j in range(len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j]==index:
|
|
|
- begin_index = j
|
|
|
- break
|
|
|
- elif list_tokenbegin[j]>index:
|
|
|
- begin_index = j-1
|
|
|
- break
|
|
|
- index = _match.span()[1]
|
|
|
- end_index_temp = index
|
|
|
- #index += len(str(all_match[i][0]))
|
|
|
- for j in range(begin_index,len(list_tokenbegin)):
|
|
|
- if list_tokenbegin[j]>=index:
|
|
|
- end_index = j-1
|
|
|
- break
|
|
|
- entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
|
|
|
-
|
|
|
- symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else '' # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元 预 算 --- 350000.0
|
|
|
-
|
|
|
- entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
|
|
|
- # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
|
|
|
- if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/8/5过滤掉总投资金额
|
|
|
- # print('总投资金额: ', _match.group(0))
|
|
|
- notes = '总投资'
|
|
|
- elif re.search('投资|概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/11/18 投资金额不作为招标金额
|
|
|
- notes = '投资'
|
|
|
- elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]): # 2021/12/20 工程造价不作为招标金额
|
|
|
- notes = '工程造价'
|
|
|
- elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
|
|
|
- or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
|
|
|
- sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
|
|
|
- or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
|
|
|
- sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
|
|
|
- notes = '保证金'
|
|
|
- # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
|
|
|
- elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
|
|
|
- sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
|
|
|
- notes = '成本警戒线'
|
|
|
- elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
|
|
|
- cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
|
|
|
- notes = cost_re.group(1)
|
|
|
- elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
|
|
|
- notes = '单价'
|
|
|
- elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
|
|
|
- notes = '大写'
|
|
|
- if entity_text[0] == "拾": # 2021/12/16 修正大写金额省略了数字转换错误问题
|
|
|
- entity_text = "壹"+entity_text
|
|
|
- # print("补充备注:notes = 大写")
|
|
|
- if len(unit)>0:
|
|
|
- if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
|
|
|
- # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
|
|
|
- entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]","",unit)[0])/10000)
|
|
|
- unit = '元' # 修正金额后单位 重置为元
|
|
|
- else:
|
|
|
- # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
|
|
|
- entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(re.sub("[美日欧]","",unit)[0]))
|
|
|
- else:
|
|
|
- if entity_text.find('万')>=0 and entity_text.split('.')[0].isdigit() and len(entity_text.split('.')[0])>=8:
|
|
|
- entity_text = str(getUnifyMoney(entity_text)/10000)
|
|
|
- # print('修正金额字段含万 过大的情况')
|
|
|
- else:
|
|
|
- entity_text = str(getUnifyMoney(entity_text))
|
|
|
- if science and re.search('^E-?\d+$', science): # 科学计数
|
|
|
- entity_text = str(Decimal(entity_text+science)) if Decimal(entity_text+science) > 100 and Decimal(entity_text+science) < 10000000000 else entity_text # 结果大于100及小于100万才使用科学计算
|
|
|
-
|
|
|
- if float(entity_text)>100000000000: # float(entity_text)<100 or 2022/3/4 取消最小金额限制
|
|
|
- # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
|
|
|
- continue
|
|
|
-
|
|
|
- if notSure and unit=="" and float(entity_text)>100*10000:
|
|
|
- # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
|
|
|
- continue
|
|
|
-
|
|
|
-
|
|
|
- _exists = False
|
|
|
- for item in list_sentence_entitys:
|
|
|
- if item.entity_id==entity_id and item.entity_type==entity_type:
|
|
|
- _exists = True
|
|
|
- if (begin_index >=item.begin_index and begin_index<=item.end_index) or (end_index>=item.begin_index and end_index<=item.end_index):
|
|
|
- _exists = True
|
|
|
- if not _exists:
|
|
|
- if float(entity_text)>1:
|
|
|
- if symbol == '-': # 负值金额保留负号
|
|
|
- entity_text = '-'+entity_text
|
|
|
- list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
|
|
|
- list_sentence_entitys[-1].notes = notes # 2021/7/20 新增金额备注
|
|
|
- list_sentence_entitys[-1].money_unit = unit # 2021/7/20 新增金额备注
|
|
|
- # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
|
|
|
- # print(entity_text,unit,notes)
|
|
|
-
|
|
|
- else:
|
|
|
- index += 1
|
|
|
+ money_list, found_yeji = get_money_entity(sentence_text, found_yeji)
|
|
|
+ entity_type = "money"
|
|
|
+ for money in money_list:
|
|
|
+ # print('money: ', money)
|
|
|
+ entity_text, begin_index, end_index, unit, notes = money
|
|
|
+ end_index = end_index - 1 if entity_text.endswith(',') else end_index
|
|
|
+ entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
|
|
|
+ _exists = False
|
|
|
+ for item in list_sentence_entitys:
|
|
|
+ if item.entity_id==entity_id and item.entity_type==entity_type:
|
|
|
+ _exists = True
|
|
|
+ if (begin_index >=item.wordOffset_begin and begin_index<item.wordOffset_end) or (end_index>item.wordOffset_begin and end_index<=item.wordOffset_end):
|
|
|
+ _exists = True
|
|
|
+ # print('_exists: ',begin_index, end_index, item.wordOffset_begin, item.wordOffset_end, item.entity_text, item.entity_type)
|
|
|
+ if not _exists:
|
|
|
+ if float(entity_text)>1:
|
|
|
+ # if symbol == '-': # 负值金额保留负号
|
|
|
+ # entity_text = '-'+entity_text # 20230414 取消符号
|
|
|
+ begin_words = changeIndexFromWordToWords(tokens, begin_index)
|
|
|
+ end_words = changeIndexFromWordToWords(tokens, end_index)
|
|
|
+ # print('金额位置: ', begin_index, begin_words,end_index, end_words)
|
|
|
+ # print('金额召回: ', entity_text, sentence_text[begin_index:end_index], tokens[begin_words:end_words])
|
|
|
+ list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_words,end_words,begin_index,end_index,in_attachment=in_attachment))
|
|
|
+ list_sentence_entitys[-1].notes = notes # 2021/7/20 新增金额备注
|
|
|
+ list_sentence_entitys[-1].money_unit = unit # 2021/7/20 新增金额备注
|
|
|
+ # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
|
|
|
+ # print(entity_text,unit,notes)
|
|
|
|
|
|
# "联系人"正则补充提取 2021/11/15 新增
|
|
|
list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']
|