Pārlūkot izejas kodu

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

znj 2 gadi atpakaļ
vecāks
revīzija
3e08f3f58f

+ 5 - 1
BiddingKG/dl/common/Utils.py

@@ -305,6 +305,8 @@ def changeIndexFromWordToWords(tokens,word_index):
         if before_index<=word_index and after_index>word_index:
             return i
         before_index = after_index
+    return i+1
+
         
 def getIndexOfWords(words):
     global vocab_words,file_vocab_words
@@ -895,6 +897,8 @@ def uniform_package_name(package_name):
     if name == "":
         return package_name_raw
     else:
+        if name.isdigit():
+            name = str(int(name))
         # print('原始包号:%s, 处理后:%s'%(package_name, name))
         return name
 
@@ -907,7 +911,7 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)
         if '万元' in header and '万' not in money_text:

+ 215 - 187
BiddingKG/dl/interface/Preprocessing.py

@@ -1863,6 +1863,8 @@ def special_treatment(sourceContent, web_source_no):
         elif web_source_no=='00811-8':
             if re.search('是否中标:是', sourceContent) and re.search('排名:\d,', sourceContent):
                 sourceContent = re.sub('排名:\d,', '候选', sourceContent)
+        elif web_source_no=='DX000726-6':
+            sourceContent = re.sub('卖方[::\s]+宝山钢铁股份有限公司', '招标单位:宝山钢铁股份有限公司', sourceContent)
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -2201,6 +2203,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
+        article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed)    # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
+        article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
         ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
@@ -2229,7 +2233,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed_list[1] = attachment_text
             article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8', '03795-1', '03795-2']:
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8', '03795-1', '03795-2', 'DX000726-6']:
             article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway
@@ -2423,6 +2427,189 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
     return list_sentences,list_outlines
 
+def get_money_entity(sentence_text, found_yeji):
+    # Extract monetary amounts from a single sentence with regular expressions.
+    #   sentence_text: the sentence string to scan.
+    #   found_yeji: running count of "业绩" cue hits; it is incremented here and,
+    #               once it is >= 2, nothing is extracted from the sentence.
+    # Returns (money_list, found_yeji). Each money_list item is the tuple
+    # (entity_text, start_index, end_index, unit, notes): the amount as a string,
+    # the character offsets of the amount inside sentence_text (start skips the
+    # matched prefix text), the unit string and a note label (may be '').
+    money_list = []
+    # Recognise amounts with regular expressions.
+    entity_type = "money"  # NOTE(review): not referenced again inside this function
+    # Four alternative patterns. Their named groups are prefixed money_/unit_/text_/
+    # filter_/science_ and are dispatched on that prefix in the groupdict loop below.
+    list_money_pattern = {"cn": "(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*(,?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
+                          "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+    # 2021/7/19: adjusted the amount/unit regexes to fix amounts being dropped when unit extraction failed.
+
+    # Alternation order is cn, key_word, behind_m, front_m.
+    pattern_money = re.compile("%s|%s|%s|%s" % (
+    list_money_pattern["cn"], list_money_pattern["key_word"], list_money_pattern["behind_m"],
+    list_money_pattern["front_m"]))
+
+    if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text):
+        found_yeji += 1
+    if found_yeji >= 2:  # drop every amount once the 业绩 counter has reached 2
+        all_match = []
+    else:
+        ser = re.search('((收费标准|计算[方公]?式):|\w{3,5}\s*=)+\s*[中标投标成交金额招标人预算价格万元\s()()\[\]【】\d\.%%‰\+\-*×/]{20,}[,。]?', sentence_text)  # find a fee-standard / formula expression whose amounts must be ignored
+        if ser:
+            # Blank the formula out with the same number of spaces so match offsets stay aligned with sentence_text.
+            all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' ' * len(ser.group(0))))
+        else:
+            all_match = re.finditer(pattern_money, sentence_text)
+    for _match in all_match:
+        # print('_match: ', _match.group())
+        if len(_match.group()) > 0:
+            # print("===",_match.group())
+            # # print(_match.groupdict())
+            notes = ''  # 2021/7/20: note label attached to the amount (e.g. uppercase amount, investment, deposit)
+            unit = ""
+            entity_text = ""
+            start_index = ""  # placeholder; overwritten from _match.span() below
+            end_index = ""  # placeholder; overwritten from _match.span() below
+            text_beforeMoney = ""
+            filter = ""  # NOTE(review): shadows the builtin filter() inside this function
+            filter_unit = False
+            notSure = False
+            science = ""
+            if re.search('业绩(公示|汇总|及|报告|\w{,2}(内容|情况|信息)|[^\w])', sentence_text[:_match.span()[0]]):  # 2021/7/21: filter out amounts that come after 业绩
+                # print('amount follows 业绩: ', _match.group(0))
+                found_yeji += 1
+                break  # stops processing the remaining matches of this sentence as well
+            # Dispatch every non-empty named group on the prefix of its name.
+            for k, v in _match.groupdict().items():
+                if v != "" and v is not None:
+                    if k == 'text_key_word':
+                        notSure = True
+                    if k.split("_")[0] == "money":
+                        entity_text = v
+                        # print(_match.group(k), 'entity_text: ', sentence_text[_match.start(k): _match.end(k)])
+                        if entity_text.endswith(',00'):  # an amount cannot end in a comma followed by two zeros; presumably a misread decimal point, so drop it
+                            entity_text = entity_text[:-3]
+                    if k.split("_")[0] == "unit":
+                        if v == '万元' or unit == "":  # handles cases like "预算金额(元):160万元" where the units before and after the number disagree
+                            unit = v
+                    if k.split("_")[0] == "text":
+                        # print('text_before: ', _match.group(k))
+                        text_beforeMoney = v
+                    if k.split("_")[0] == "filter":
+                        # Also true for the filter_unit1/2/3 groups, since their names start with "filter_".
+                        filter = v
+                    if re.search("filter_unit", k) is not None:
+                        filter_unit = True
+                    if k.split("_")[0] == 'science':
+                        science = v
+            # print("amount:{0}, unit:{1}, preceding text:{2}, filter: {3}, filter_unit: {4}".format(entity_text,unit,text_beforeMoney,filter,filter_unit))
+            # Disabled OCR correction, kept for reference:
+            # if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 fix OCR reading a decimal point as a comma
+            #     if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0] - 2):_match.span()[0]]):
+            #         entity_text = re.sub('\d+,', '', entity_text)
+            #     else:
+            #         entity_text = entity_text.replace(',', '.')
+            #     # print(' fixed OCR reading a decimal point as a comma')
+
+            if filter != "":
+                continue
+            start_index, end_index = _match.span()
+            start_index += len(text_beforeMoney)  # skip the keyword/prefix text so the span starts at the number
+            if unit == "":  # 2021/7/21: supply a unit for amounts with obvious money features so they are not filtered out
+                if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
+                    unit = '元'
+                    # print('1: obvious money feature, unit defaults to 元')
+                elif re.search('USD[::]?$', text_beforeMoney):
+                    unit = '美元'
+                elif re.search('EUR[::]?$', text_beforeMoney):
+                    unit = '欧元'
+                elif re.search('JPY[::]?$', text_beforeMoney):
+                    unit = '日元'
+                elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
+                    # print('two amounts are joined and the later one has a unit; use that unit')
+                    unit = '万元'
+                # NOTE(review): in '^0|1[3|4|5|6|7|8|9]\d{9}' the '^' anchors only the first alternative and
+                # the '|' characters inside the character class are literal - confirm this is intended.
+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
+                    if re.search('^[\d,,.]+$', entity_text) and re.sub('[,,.]', '', entity_text).isdigit() and float(re.sub('[,,.]', '', entity_text))<500 and re.search('万元', sentence_text):
+                        unit = '万元'
+                        # print('amount is small and the sentence contains 万元; unit defaults to 万元')
+                    else:
+                        unit = '元'
+                        # print('amount directly follows a keyword; unit defaults to 元')
+                elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)', entity_text):
+                    unit = '元'
+                    # print('3: obvious money feature, unit defaults to 元')
+                else:
+                    # print('dropping amount without a unit: ',entity_text)
+                    continue
+            elif unit == '万元':
+                if end_index < len(sentence_text) and sentence_text[end_index] == '元' and re.search('\d$', entity_text):
+                    unit = '元'
+                elif re.search('^[5-9]\d{6,}\.\d{2}$', entity_text): # a 万元 figure of 50 billion or more is treated as 元
+                    unit = '元'
+            if unit.find("万") >= 0 and entity_text.find("万") >= 0:  # 2021/7/19: when the amount text itself contains 万, do not apply the 万 unit again
+                # print('amount and unit both contain 万, amount:',entity_text, 'unit:',unit)
+                unit = "元"
+            if re.search('.*万元万元', entity_text):  # 2021/7/19: collapse a doubled 万元
+                # print(' collapsing doubled 万元',entity_text)
+                entity_text = entity_text.replace('万元万元', '万元')
+            else:
+                # NOTE(review): looks unreachable - whenever filter_unit is True, filter was set to the
+                # same non-empty group value and the `filter != ""` check above already skipped the match.
+                if filter_unit:
+                    continue
+
+            # symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # keep the minus sign on negative amounts; not treated as negative: "起拍价:105.29-200.46万元", "预 算 --- 350000.0"; 2023/04/14 sign handling removed
+
+            # Keep only digits, the decimal point and Chinese numeral/unit characters.
+            entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", entity_text)
+            # print('before conversion amount:', entity_text, 'unit:', unit, 'notes:',notes, 'text_beforeMoney:',text_beforeMoney)
+            # Label the amount from cue words found in a window around the match; the first matching branch wins.
+            if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额',
+                         sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5: total-investment amounts (only labelled here, not dropped)
+                # print('total investment amount: ', _match.group(0))
+                notes = '总投资'
+            elif re.search('投资|概算',
+                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18: investment amounts are not to be used as the tender amount
+                notes = '投资'
+            elif re.search('工程造价',
+                           sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20: construction-cost amounts are not to be used as the tender amount
+                notes = '工程造价'
+            elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
+                  or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
+                               sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
+                  or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
+                               sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
+                notes = '保证金'
+                # print('deposit info:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
+            elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
+                           sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
+                notes = '成本警戒线'
+            elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
+                cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
+                notes = cost_re.group(1)  # the fee kind itself (监理 / 设计 / 勘察) becomes the note
+            elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
+                notes = '单价'
+            elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
+                notes = '大写'
+                if entity_text[0] == "拾":  # 2021/12/16: fix the conversion error when an uppercase amount omits its leading numeral
+                    entity_text = "壹" + entity_text
+                # print("notes = 大写")
+            # Convert to a numeric string via getUnifyMoney, scaled by the unit's multiple factor.
+            if len(unit) > 0:
+                if unit.find('万') >= 0 and len(entity_text.split('.')[0]) >= 8:  # 2021/7/19: correct 万-unit amounts that are implausibly large
+                    # print('correcting oversized 万-unit amount, amount:', entity_text, 'unit:', unit)
+                    entity_text = str(
+                        getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]) / 10000)
+                    unit = '元'  # after the correction the unit is reset to 元
+                else:
+                    # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
+                    entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]", "", unit)[0]))
+            else:
+                if entity_text.find('万') >= 0 and entity_text.split('.')[0].isdigit() and len(
+                        entity_text.split('.')[0]) >= 8:
+                    entity_text = str(getUnifyMoney(entity_text) / 10000)
+                    # print('correcting oversized amount whose text contains 万')
+                else:
+                    entity_text = str(getUnifyMoney(entity_text))
+            if science and re.search('^E-?\d+$', science):  # scientific notation
+                entity_text = str(Decimal(entity_text + science)) if Decimal(entity_text + science) > 100 and Decimal(
+                    entity_text + science) < 10000000000 else entity_text  # exponent applied only when the result is > 100 and < 10000000000 (NOTE(review): the original comment said "less than 1 million")
+
+            if float(entity_text) > 100000000000:  # 2022/3/4: the former minimum-amount check (float(entity_text)<100) was removed; only the upper bound remains
+                # print('dropping amount: float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
+                continue
+
+            if notSure and unit == "" and float(entity_text) > 100 * 10000:
+                # print('dropping amount: notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
+                continue
+            # print("amount:{0}, unit:{1}, preceding text:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
+            #                                                                      filter, filter_unit))
+            money_list.append((entity_text, start_index, end_index, unit, notes))
+    return money_list, found_yeji
+
 def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
@@ -2605,193 +2792,34 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             second2last.is_tail = True
 
             #使用正则识别金额
-            entity_type = "money"
-            list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-*[0-9][\d,]*(?:\.\d+)?(?P<science_key_word>(E-?\d+))?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
-            # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
-
-            pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
-
-            # if re.search('评标结果|候选人公示', sentence_text):
-            #     found_pingbiao = True
-            if re.search('业绩', sentence_text):
-                found_yeji += 1
-            if found_yeji >= 2: # 过滤掉业绩后面的所有金额
-                all_match = []
-            else:
-                ser = re.search('(收费标准|计算方式):\w{3,5}=[中标成交金额价格万元()\d%+*.-\[\]]+[,。]', sentence_text)  # 过滤掉收费标准里面的金额
-                if ser:
-                    all_match = re.finditer(pattern_money, sentence_text.replace(ser.group(0), ' '*len(ser.group(0))))
-                else:
-                    all_match = re.finditer(pattern_money, sentence_text)
-            index = 0
-            for _match in all_match:
-                if len(_match.group())>0:
-                    # print("===",_match.group())
-                    # # print(_match.groupdict())
-                    notes = ''  # 2021/7/20 新增备注金额大写或金额单位 if 金额大写 notes=大写 elif 单位 notes=单位
-                    unit = ""
-                    entity_text = ""
-                    text_beforeMoney = ""
-                    filter = ""
-                    filter_unit = False
-                    notSure = False
-                    science = ""
-                    if re.search('业绩', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
-                        # print('金额在业绩后面: ', _match.group(0))
-                        found_yeji += 1
-                        break
-                    if (re.search('电话|编码|编号|号码|日期|时间|账号', sentence_text[max(0, _match.start()-12): _match.end()]) or re.search('^[a-zA-Z0-9+-]', sentence_text[_match.end():])) and re.search('[元¥¥]', _match.group(0)) == None:
-                        continue
 
-                    for k,v in _match.groupdict().items():
-                        if v!="" and v is not None:
-                            if k=='text_key_word':
-                                notSure = True
-                            if k.split("_")[0]=="money":
-                                entity_text = v
-                                if entity_text.endswith(',00'): # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
-                                    entity_text = entity_text[:-3]
-                            if k.split("_")[0]=="unit":
-                                if v=='万元' or unit=="":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
-                                    unit = v
-                            if k.split("_")[0]=="text":
-                                text_beforeMoney = v
-                            if k.split("_")[0]=="filter":
-                                filter = v
-                            if re.search("filter_unit",k) is not None:
-                                filter_unit = True
-                            if k.split("_")[0] == 'science':
-                                science = v
-                    # print(_match.group())
-                    # print(entity_text,unit,text_beforeMoney,filter,filter_unit)
-                    if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
-                        if re.search('[幢栋号楼层]', sentence_text[max(0, _match.span()[0]-2):_match.span()[0]]):
-                             entity_text = re.sub('\d+,', '', entity_text)
-                        else:
-                            entity_text = entity_text.replace(',', '.')
-                        # print(' 修正OCR识别小数点为逗号')
-
-                    if entity_text.find("元")>=0:
-                        unit = ""
-                    if unit == "":  #2021/7/21 有明显金额特征的补充单位,避免被过滤
-                        if ('¥' in text_beforeMoney or '¥' in text_beforeMoney):
-                            unit = '元'
-                            # print('明显金额特征补充单位 元')
-                        elif re.search('[单报标限]价|金额|价格|(监理|设计|勘察)(服务)?费[::为]+$', text_beforeMoney.strip()) and \
-                                re.search('\d{5,}',entity_text) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}',entity_text)==None:
-                            unit = '元'
-                            # print('明显金额特征补充单位 元')
-                        elif re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|(^\d{,3}(,\d{3})+,?$)',entity_text):
-                            unit = '元'
-                            # print('明显金额特征补充单位 元')
-                    if unit.find("万") >= 0 and entity_text.find("万") >= 0:  #2021/7/19修改为金额文本有万,不计算单位
-                        # print('修正金额及单位都有万, 金额:',entity_text, '单位:',unit)
-                        unit = "元"
-                    if re.search('.*万元万元', entity_text):  #2021/7/19 修正两个万元
-                        # print(' 修正两个万元',entity_text)
-                        entity_text = entity_text.replace('万元万元','万元')
-                    else:
-                        if filter_unit:
-                            continue
-                    if filter!="":
-                        continue
-
-                    index = _match.span()[0]+len(text_beforeMoney)
-                    begin_index_temp = index
-                    for j in range(len(list_tokenbegin)):
-                        if list_tokenbegin[j]==index:
-                            begin_index = j
-                            break
-                        elif list_tokenbegin[j]>index:
-                            begin_index = j-1
-                            break
-                    index = _match.span()[1]
-                    end_index_temp = index
-                    #index += len(str(all_match[i][0]))
-                    for j in range(begin_index,len(list_tokenbegin)):
-                        if list_tokenbegin[j]>=index:
-                            end_index = j-1
-                            break
-                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
-
-                    symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元  预 算 --- 350000.0
-
-                    entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
-                    # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
-                    if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
-                        # print('总投资金额: ', _match.group(0))
-                        notes = '总投资'
-                    elif re.search('投资|概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
-                        notes = '投资'
-                    elif re.search('工程造价', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/12/20 工程造价不作为招标金额
-                        notes = '工程造价'
-                    elif (re.search('保证金', sentence_text[max(0, _match.span()[0] - 5):_match.span()[1]])
-                          or re.search('保证金的?(缴纳)?(金额|金\?|额|\?)?[\((]*(万?元|为?人民币|大写|调整|变更|已?修改|更改|更正)?[\))]*[::为]',
-                                       sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]])
-                          or re.search('保证金由[\d.,]+.{,3}(变更|修改|更改|更正|调整?)为',
-                                       sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])):
-                        notes = '保证金'
-                        # print('保证金信息:', sentence_text[max(0, _match.span()[0] - 15):_match.span()[1]])
-                    elif re.search('成本(警戒|预警)(线|价|值)[^0-9元]{,10}',
-                                   sentence_text[max(0, _match.span()[0] - 10):_match.span()[0]]):
-                        notes = '成本警戒线'
-                    elif re.search('(监理|设计|勘察)(服务)?费(报价)?[约为:]', sentence_text[_match.span()[0]:_match.span()[1]]):
-                        cost_re = re.search('(监理|设计|勘察)(服务)?费', sentence_text[_match.span()[0]:_match.span()[1]])
-                        notes = cost_re.group(1)
-                    elif re.search('单价|总金额', sentence_text[_match.span()[0]:_match.span()[1]]):
-                        notes = '单价'
-                    elif re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆]', entity_text) != None:
-                        notes = '大写'
-                        if entity_text[0] == "拾":  # 2021/12/16 修正大写金额省略了数字转换错误问题
-                            entity_text = "壹"+entity_text
-                        # print("补充备注:notes = 大写")
-                    if len(unit)>0:
-                        if unit.find('万')>=0 and len(entity_text.split('.')[0])>=8: # 2021/7/19 修正万元金额过大的情况
-                            # print('修正单位万元金额过大的情况 金额:', entity_text, '单位:', unit)
-                            entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(re.sub("[美日欧]","",unit)[0])/10000)
-                            unit = '元' # 修正金额后单位 重置为元
-                        else:
-                            # print('str(getUnifyMoney(entity_text)*getMultipleFactor(unit[0])):')
-                            entity_text = str(getUnifyMoney(entity_text)*getMultipleFactor(re.sub("[美日欧]","",unit)[0]))
-                    else:
-                        if entity_text.find('万')>=0 and entity_text.split('.')[0].isdigit() and len(entity_text.split('.')[0])>=8:
-                            entity_text = str(getUnifyMoney(entity_text)/10000)
-                            # print('修正金额字段含万 过大的情况')
-                        else:
-                            entity_text = str(getUnifyMoney(entity_text))
-                    if science and re.search('^E-?\d+$', science):  # 科学计数
-                        entity_text = str(Decimal(entity_text+science)) if Decimal(entity_text+science) > 100 and Decimal(entity_text+science) < 10000000000 else entity_text # 结果大于100及小于100万才使用科学计算
-
-                    if float(entity_text)>100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
-                        # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
-                        continue
-
-                    if notSure and unit=="" and float(entity_text)>100*10000:
-                        # print('过滤掉金额 notSure and unit=="" and float(entity_text)>100*10000:', entity_text, unit)
-                        continue
-
-
-                    _exists = False
-                    for item in list_sentence_entitys:
-                        if item.entity_id==entity_id and item.entity_type==entity_type:
-                            _exists = True
-                        if (begin_index >=item.begin_index and begin_index<=item.end_index) or (end_index>=item.begin_index and end_index<=item.end_index):
-                            _exists = True
-                    if not _exists:
-                        if float(entity_text)>1:
-                            if symbol == '-': # 负值金额保留负号
-                                entity_text = '-'+entity_text
-                            list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
-                            list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
-                            list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
-                            # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
-                            # print(entity_text,unit,notes)
-
-                else:
-                    index += 1
+            money_list, found_yeji = get_money_entity(sentence_text, found_yeji)
+            entity_type = "money"
+            for money in money_list:
+                # print('money: ', money)
+                entity_text, begin_index, end_index, unit, notes = money
+                end_index = end_index - 1 if entity_text.endswith(',') else end_index
+                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
+                _exists = False
+                for item in list_sentence_entitys:
+                    if item.entity_id==entity_id and item.entity_type==entity_type:
+                        _exists = True
+                    if (begin_index >=item.wordOffset_begin and begin_index<item.wordOffset_end) or (end_index>item.wordOffset_begin and end_index<=item.wordOffset_end):
+                        _exists = True
+                        # print('_exists: ',begin_index, end_index, item.wordOffset_begin, item.wordOffset_end, item.entity_text, item.entity_type)
+                if not _exists:
+                    if float(entity_text)>1:
+                        # if symbol == '-': # 负值金额保留负号
+                        #     entity_text = '-'+entity_text   # 20230414 取消符号
+                        begin_words = changeIndexFromWordToWords(tokens, begin_index)
+                        end_words = changeIndexFromWordToWords(tokens, end_index)
+                        # print('金额位置: ', begin_index, begin_words,end_index, end_words)
+                        # print('金额召回: ', entity_text, sentence_text[begin_index:end_index], tokens[begin_words:end_words])
+                        list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_words,end_words,begin_index,end_index,in_attachment=in_attachment))
+                        list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
+                        list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
+                        # print('预处理中的 金额:%s, 单位:%s'%(entity_text,unit))
+                        # print(entity_text,unit,notes)
 
             # "联系人"正则补充提取  2021/11/15 新增
             list_person_text = [entity.entity_text for entity in list_sentence_entitys if entity.entity_type=='person']

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -323,7 +323,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-04-07'}
+    version_date = {'version_date': '2023-04-23'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 15 - 238
BiddingKG/dl/interface/getAttributes.py

@@ -562,233 +562,6 @@ def getPackageScopePattern():
     return pattern
         
 pattern_packageScope = getPackageScopePattern()   
-def getPackagesFromArticle_backup(list_sentence,list_entity):
-    '''
-    @param:
-        list_sentence:文章的句子list
-    @summary: 将包的信息插入list_entity中
-    @return: type:list if [包号,句子index,词偏移,标段号] meaning:文章的包/标段信息
-    '''
-    
-    if len(list_sentence)==0:
-        return None
-    list_sentence.sort(key=lambda x:x.sentence_index)
-
-    PackageList = []
-    PackageList_scope = []
-    PackageSet = set()
-    dict_packageCode = dict()
-    
-    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[::]?([^::]{3,30}?),{1}")
-    package_N_name_pattern = re.compile("(([^承]|^)分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[::]?[\((]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2},{1}")
-    package_number_pattern = re.compile("(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|([^\.]?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(标[段号的包]))")  # 第? 去掉问号 修复 纯木浆8包/箱复印 这种作为包号
-    # other_package_pattern = re.compile('(项目名称|物资名称|设备名称|场次名称|标段名称)[::](.{,20}?)(,|项目)')  # 新正则识别标段
-    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  #  # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
-    win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
-    model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')  # 2020/11/23 大网站规则 调整
-    number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
-
-    package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
-    # 纯数字类型的包号统一,例如:'01','1'
-    re_digital = re.compile("^\d+$")
-    def changeIndexFromWordToWords(tokens,word_index):
-        '''
-        @summary:转换某个字的字偏移为词偏移
-        '''
-        before_index = 0
-        after_index = 0
-        for i in range(len(tokens)):
-            after_index = after_index+len(tokens[i])
-            if before_index<=word_index and after_index>=word_index:
-                return i
-            before_index = after_index
-    package_names = []
-    
-    def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
-        '''
-        @summary:抽取包附近的标段号
-        @param:
-            tokens:包所在句子的分词
-            word_index:包所在字偏移
-            size:左右各取多少个词
-            pattern:提取标段号的正则
-        @return: type:string,meaning:标段号
-        '''
-        index = changeIndexFromWordToWords(tokens,word_index)
-        if index<size:
-            begin = index
-        else:
-            begin = index-size
-        if index+size>len(tokens):
-            end = len(tokens)
-        else:
-            end = index+size
-        #拿到左右两边的词语组成短语
-        text = "".join(tokens[begin:end])
-        #在短语中的字偏移
-        new_word_index = word_index-len("".join(tokens[:begin]))
-        min_distance = len(text)
-        packageCode = None
-        for the_iter in re.finditer(pattern,text):
-            #算出最小距离
-            distance = min([abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1])])
-            if distance<min_distance:
-                min_distance = distance
-                packageCode = the_iter.group(1)
-        return packageCode
-    #从标段介绍表格中提取包名和包号
-    for i in range(len(list_sentence)):
-        content = list_sentence[i].sentence_text
-        names = re.findall(package_name_pattern,content)
-        if names == []:
-            names = re.findall(other_package_pattern, content)
-        N_names = re.findall(package_N_name_pattern,content)
-        if len(names)==1 and len(N_names)==1:
-            package_names.append([names[0][-1],N_names[0][-1]])
-    for i in range(len(list_sentence)):
-        PackageList_item = []
-        PackageList_item_scope = []
-        content = list_sentence[i].sentence_text
-        tokens = list_sentence[i].tokens
-        _names = []
-        # 2021/6/23 包名称去重
-        for name in package_names:
-            if name not in _names:
-                _names.append(name)
-        # for name in package_names[:20]:
-        for name in _names[:20]:
-            for index in findAllIndex(name[0],content):
-                temp_package_number = re.findall(number_pattern,name[1])[0]
-                if re.search(re_digital,temp_package_number):
-                    temp_package_number = str(int(temp_package_number))
-                PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,index),"offsetWord_begin":index,"offsetWord_end":index+len(name[0])})
-                # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,index),index,index+len(str(temp_package_number))])
-                code = extractPackageCode(tokens, index)
-                if code is not None:
-                    dict_packageCode[temp_package_number] = code
-                PackageSet.add(temp_package_number)
-        for iter in re.finditer(package_number_pattern,content):
-            if re.match('\d', iter.group(0)) and iter.end()<len(content) and content[iter.end()].isdigit():  # 排除2.10标段3 这种情况
-                continue
-            temp_package_number = re.findall(number_pattern,content[iter.span()[0]:iter.span()[1]])[0]
-            if re.search(re_digital, temp_package_number):
-                temp_package_number = str(int(temp_package_number))
-            PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-            # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-            code = extractPackageCode(tokens, iter.span()[0])
-            if code is not None:
-                dict_packageCode[temp_package_number] = code
-            PackageSet.add(temp_package_number)
-        
-        #识别packageScope
-        for iter in re.finditer(pattern_packageScope,content):
-            PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-            # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-        PackageList_item_scope = PackageList_item +PackageList_item_scope
-        PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
-        PackageList_scope = PackageList_scope+PackageList_item_scope
-        PackageList_item.sort(key=lambda x:x["sentence_index"])
-        #PackageList = PackageList+PackageList_item
-    #不作为包
-    # if len(PackageSet)==0:
-    #     for i in range(len(list_sentence)):
-    #         PackageList_item = []
-    #         PackageList_item_scope = []
-    #         content = list_sentence[i].sentence_text
-    #         tokens = list_sentence[i].tokens
-    #         for iter in re.finditer(other_package_pattern,content):
-    #             temp_package_number = iter.group(2)
-    #             PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-    #             # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-    #             code = extractPackageCode(tokens, iter.span()[0])
-    #             if code is not None:
-    #                 dict_packageCode[temp_package_number] = code
-    #             PackageSet.add(temp_package_number)
-    #         #识别packageScope
-    #         for iter in re.finditer(pattern_packageScope,content):
-    #             PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-    #             # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-    #         PackageList_item_scope = PackageList_item +PackageList_item_scope
-    #         PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
-    #         PackageList_scope = PackageList_scope+PackageList_item_scope
-    #         PackageList_item.sort(key=lambda x:x["sentence_index"])
-
-    # 2020/11/23 大网站规则 调整
-    if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
-        for i in range(len(list_sentence)):
-            PackageList_item = []
-            PackageList_item_scope = []
-            content = list_sentence[i].sentence_text
-            tokens = list_sentence[i].tokens
-            names = re.findall(other_package_pattern, content)
-            N_names = re.findall(win_tenderer_pattern, content)
-            if len(names) != 1 or len(N_names) != 1:
-                continue
-            for iter in re.finditer(other_package_pattern,content):
-                temp_package_number = iter.group(4)
-                xinghao = re.search(model_pattern, content)
-                if xinghao:
-                    temp_package_number = temp_package_number + '+' + xinghao.group(2)
-                # print('新正则采购包名补充',temp_package_number)
-                if re.search(re_digital,temp_package_number):
-                    temp_package_number = str(int(temp_package_number))
-                PackageList_item.append({"name":temp_package_number,"sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-                # PackageList_item.append([temp_package_number,i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-                code = extractPackageCode(tokens, iter.span()[0])
-                if code is not None:
-                    dict_packageCode[temp_package_number] = code
-                PackageSet.add(temp_package_number)
-            #识别packageScope
-            for iter in re.finditer(pattern_packageScope,content):
-                PackageList_item_scope.append({"name":"","sentence_index":list_sentence[i].sentence_index,"offsetWords_begin":changeIndexFromWordToWords(tokens,iter.span()[0]),"offsetWord_begin":iter.span()[0],"offsetWord_end":iter.span()[1]})
-                # PackageList_item_scope.append(["",i,changeIndexFromWordToWords(tokens,iter.span()[0]),iter.span()[0],iter.span()[1]])
-            PackageList_item_scope = PackageList_item +PackageList_item_scope
-            PackageList_item_scope.sort(key=lambda x:x["offsetWord_begin"])
-            PackageList_scope = PackageList_scope+PackageList_item_scope
-            PackageList_item.sort(key=lambda x:x["sentence_index"])
-    pattern_punctuation = "[::()\(\),,。;;]"
-  # print("===packageList_scope",PackageList_scope)
-    for i in range(len(list_sentence)):
-        for j in range(len(PackageList_scope)):
-            if i==PackageList_scope[j]["sentence_index"] and PackageList_scope[j]["name"]!="":
-                _flag = False
-                left_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]-30:PackageList_scope[j]["offsetWord_begin"]+1]
-                right_str = list_sentence[i].sentence_text[PackageList_scope[j]["offsetWord_begin"]:PackageList_scope[j]["offsetWord_begin"]+30]
-                _left_find = re.findall(pattern_punctuation,left_str)
-                _right_find = re.findall(pattern_punctuation,right_str)
-                #print(left_str)
-                if re.search("同",left_str[-1:]) is not None and PackageList_scope[j]["name"]=="一":
-                    continue
-                if re.search("划分",right_str[:10]) is not None:
-                    continue
-                if len(_left_find)>0 and _left_find[-1] in [":",":"]:
-                    _flag = True
-                if len(_right_find)>0 and _right_find[0] in [":",":"]:
-                    _flag = True
-                if _flag:
-                    scope_begin = [PackageList_scope[j]["sentence_index"],PackageList_scope[j]["offsetWords_begin"]]
-                else:
-                    if j==0:
-                        scope_begin = [0,0]
-                    else:
-                        scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
-                if j==len(PackageList_scope)-1:
-                    scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))]
-                else:
-                    scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
-                if PackageList_scope[j-1]["sentence_index"]==PackageList_scope[j]["sentence_index"] and PackageList_scope[j-1]["offsetWord_begin"]<=PackageList_scope[j]["offsetWord_begin"] and PackageList_scope[j-1]["offsetWord_end"]>=PackageList_scope[j]["offsetWord_end"]:
-                    continue
-
-                #add package to entity
-                _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,PackageList_scope[j]["offsetWord_begin"],PackageList_scope[j]["offsetWord_begin"]),entity_text=PackageList_scope[j]["name"],entity_type="package",sentence_index=PackageList_scope[j]["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,PackageList_scope[j]["offsetWord_end"]),wordOffset_begin=PackageList_scope[j]["offsetWord_begin"],wordOffset_end=PackageList_scope[j]["offsetWord_end"],in_attachment=list_sentence[i].in_attachment)
-                list_entity.append(_pack_entity)
-                copy_pack = copy.copy(PackageList_scope[j])
-                copy_pack["scope"] = [scope_begin,scope_end]
-                copy_pack["hit"] = set()
-                copy_pack["pointer"] = _pack_entity
-
-                PackageList.append(copy_pack)
-    return PackageList,PackageSet,dict_packageCode
 
 def getPackagesFromArticle(list_sentence, list_entity):
     '''
@@ -811,7 +584,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
     # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
 
     package_number_pattern = re.compile(
-        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|合同[包段]))\
 |(([,;。、:(]|第)?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
 |((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*))\
@@ -3641,15 +3414,16 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
                 money_text = ser.group('money')
                 header = ser.group('header')
                 money, money_unit = money_process(money_text, header)
-                for value in prem[0]['prem'].values():
-                    for l in value['roleList']:
-                        try: # 如果原中标金额为0 或 金额小于合计金额0.1倍且正文没中标金额关键词 替换为 合计金额
-                            if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money'])==0 or (float(l["role_money"]['money']) < money / 10 and re.search('(中标|成交|合同)(总?金额|[单报总]?价)', content) == None)):
-                                l["role_money"]['money'] = str(money)
-                                l["role_money"]['money_unit'] = money_unit
-                                # print('修改中标金额为总价或合计金额')
-                        except Exception as e:
-                            print('修正中标价格报错:%s' % e)
+                if 100<money<8000000:
+                    for value in prem[0]['prem'].values():
+                        for l in value['roleList']:
+                            try: # 如果原中标金额为0 或 金额小于合计金额0.1倍且正文没中标金额关键词 替换为 合计金额
+                                if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money'])==0 or (float(l["role_money"]['money']) < money / 10 and re.search('(中标|成交|合同)(总?金额|[单报总]?价)', content) == None)):
+                                    l["role_money"]['money'] = str(money)
+                                    l["role_money"]['money_unit'] = money_unit
+                                    # print('修改中标金额为总价或合计金额')
+                            except Exception as e:
+                                print('修正中标价格报错:%s' % e)
 
 def limit_maximum_amount(prem, industry):
     indu = industry['industry'].get('class_name', '')
@@ -3817,7 +3591,10 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-
+        if len(old_prem)>1 and 'Project' in old_prem:
+            # Roles were extracted into other packages, so drop the winning-bidder roles from 'Project'.
+            # Filter in place with slice assignment: calling remove() on the list while iterating it
+            # skips the element that follows every removed one, leaving some tenderer roles behind.
+            tenderer_roles = ('win_tenderer', 'second_tenderer', 'third_tenderer')
+            old_prem['Project']['roleList'][:] = [d for d in old_prem['Project']['roleList'] if d['role_name'] not in tenderer_roles]
     # return old_prem
 
 def fix_single_source(prem, channel_dic, original_docchannel):

BIN
BiddingKG/dl/interface/header_set.pkl


+ 127 - 48
BiddingKG/dl/interface/predictor.py

@@ -348,6 +348,7 @@ class CodeNamePredict():
                     # print(join_predict)
                     code_x = []
                     code_text = []
+                    pre_text = []
                     temp_entitys = []
                     for iter in re.finditer(self.PC_pattern,join_predict):
                         get_len = 40
@@ -358,6 +359,7 @@ class CodeNamePredict():
                         end = iter.span()[1]+get_len
                         code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                         code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
+                        pre_text.append(pad_sentence[begin:iter.span()[0]])
                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         temp_entitys.append(_entity)
                     #print("code",code_text)
@@ -402,20 +404,52 @@ class CodeNamePredict():
                                         if len(it) > 8:
                                             if it not in code_set:
                                                 code_set.add(it)
-                                                item['code'].append(it)
+                                                # item['code'].append(it)
+                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
+                                                    item['code'].append((it, 0))
+                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
+                                                    item['code'].append((it, 1))
+                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
+                                                    item['code'].append((it, 2))
+                                                else:
+                                                    item['code'].append((it, 3))
                                         elif len(item['code']) > 0:
-                                            new_it = item['code'][-1] + re.search(',|/|;|、|,', the_code).group(0) + it
+                                            new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it
                                             if new_it not in code_set:
                                                 code_set.add(new_it)
-                                                item['code'][-1] = new_it
+                                                # item['code'][-1] = new_it
+                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
+                                                    item['code'][-1] = (new_it, 0)
+                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
+                                                    item['code'][-1] = (new_it, 1)
+                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
+                                                    item['code'][-1] = (new_it, 2)
+                                                else:
+                                                    item['code'][-1] = (new_it, 3)
                                         else:
                                             if the_code not in code_set:
                                                 code_set.add(the_code)
-                                                item['code'].append(the_code)
+                                                # item['code'].append(the_code)
+                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
+                                                    item['code'].append((the_code, 0))
+                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
+                                                    item['code'].append((the_code, 1))
+                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
+                                                    item['code'].append((the_code, 2))
+                                                else:
+                                                    item['code'].append((the_code, 3))
                                             break
                                 elif the_code not in code_set:
                                     code_set.add(the_code)
-                                    item['code'].append(the_code)
+                                    # item['code'].append(the_code)
+                                    if re.search("(项目编号|招标编号):?$", pre_text[h]):
+                                        item['code'].append((the_code, 0))
+                                    elif re.search('采购(计划)?编号:?$', pre_text[h]):
+                                        item['code'].append((the_code, 1))
+                                    elif re.search('(询价|合同)编号:?$', pre_text[h]):
+                                        item['code'].append((the_code, 2))
+                                    else:
+                                        item['code'].append((the_code, 3))
 
                                 # if the_code not in code_set:
                                 #     code_set.add(the_code)
@@ -511,10 +545,21 @@ class CodeNamePredict():
                     # 2020/11/23 大网站规则调整
                     othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
                     if othercode != None:
-                        item['code'].append(othercode.group('code'))
+                        # item['code'].append(othercode.group('code'))
+                        if re.search("(项目编号|招标编号):?$", othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 0))
+                        elif re.search('采购(计划)?编号:?$', othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 1))
+                        elif re.search('(询价|合同)编号:?$', othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 2))
+                        else:
+                            item['code'].append((othercode.group('code'), 3))
                         # print('规则召回项目编号:', othercode.group('code'))
-            item['code'] = [code for code in item['code'] if len(code)<500]
-            item['code'].sort(key=lambda x:len(x),reverse=True)
+            # item['code'] = [code for code in item['code'] if len(code)<500]
+            # item['code'].sort(key=lambda x:len(x),reverse=True)
+            item['code'] = [code for code in item['code'] if len(code[0]) < 500]
+            item['code'].sort(key=lambda x: x[1])
+            item['code'] = [it[0] for it in item['code']]
             result.append(item)
 
             list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
@@ -728,7 +773,7 @@ class PREMPredict():
             text_tup = text_list[i]
             front, middle, behind = text_tup
             whole = "".join(text_tup)
-            # print('模型预测角色:', front, entity.entity_text, label, values)
+            # print('模型预测角色:', front, entity.entity_text, behind,label, values)
             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
                 label = 5
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
@@ -755,7 +800,7 @@ class PREMPredict():
             elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
                 label = 0
                 values[label] = 0.501
-            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', front):
+            elif label == 1 and re.search('([,。:]|^)(第一)?(服务|中选|中标)(中介服务|代理)?(公司|机构)(名称)?', front):
                 label = 2
                 values[label] = 0.501
             elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
@@ -814,14 +859,16 @@ class PREMPredict():
                     values[label] = 0.49
                 elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
                     values[label] = 0.49
-                elif re.search('^(以上)?按[\d.%]+收取|^[+×*-][\d.%]+', behind):
+                elif re.search('^(以[下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
                     values[label] = 0.49
-                elif re.search('(含|在|包括)$', front):
+                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
                     values[label] = 0.49
             elif label ==0: # 错误招标金额处理
-                if entity.notes in ["投资", "工程造价"] or re.search('最低限价:?$', front):
+                if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
+                    values[label] = 0.49
+                elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
                     values[label] = 0.49
-                elif re.search('(含|在|包括)$', front):
+                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
                     values[label] = 0.49
             elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
                 label = 1
@@ -1251,18 +1298,24 @@ class RoleRulePredictor():
         self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
-        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
-                                        "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
-                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
-        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" #解决表头识别不到加逗号情况,需前面为,。空
-        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
-        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
-        # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
-                                        "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
-
-        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
+        self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
+               "(乙|竞得|受让|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
+               "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
+               "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
+               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)"  # 承办单位:不作为中标 83914772
+        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w0>" \
+                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|第?[一1]名|第一(中标)?候选人)" \
+                                           "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)"  # 解决表头识别不到加逗号情况,需前面为,。空
+        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)" \
+                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标
+
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|厂商)))|" \
+                                         "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
+                                         "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \
+                                         "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))"
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标" \
+                                         "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)"  # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
         self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
         self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
@@ -1270,6 +1323,8 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
+        self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)[::是为]+$)"
+
         self.pattern_whole = [self.pattern_tenderee_left_w1,
                               self.pattern_tenderee_left,
                               self.pattern_tenderee_left_w0,
@@ -1318,6 +1373,8 @@ class RoleRulePredictor():
             list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
             # list_name = list_codename["name"]
             list_name = []  # 2022/1/5  改为实体列表内所有项目名称
+            candidates = [] # 保存不能确定为第几的候选人 2023/04/14
+            notfound_tenderer = True  # 未找到前三候选人
             for entity in list_entity:
                 if entity.entity_type == 'name':
                     list_name.append(entity.entity_text)
@@ -1385,12 +1442,6 @@ class RoleRulePredictor():
                                         break
                             if find_flag:
                                 break
-                            # if str(_name).find(p_entity.entity_text)>=0:
-                            #     find_flag = True
-                            #     _label = 0
-                            #     p_entity.label = _label
-                            #     p_entity.values[int(_label)] = on_value
-                            #     break
                         # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
                         if find_flag:
                             continue
@@ -1412,6 +1463,7 @@ class RoleRulePredictor():
                                                                   list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
                                         p_entity.label = 2
                                         p_entity.values[2] = 0.5
+                                        notfound_tenderer = False
                                         # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                         break
                                 except Exception as e:
@@ -1443,7 +1495,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -1451,7 +1503,7 @@ class RoleRulePredictor():
                                                         _prob_weight = 1.2 if _weight=='w1' else 1
                                                         # print('_v_group:',_group, _v_group, p_entity.entity_text)
 
-                                                    if _i_span == 1 and _direct == "center":
+                                                    if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
@@ -1465,12 +1517,19 @@ class RoleRulePredictor():
                                                         _prob_weight = 1.2 if _weight == 'w1' else 1
                                                         # print('_v_group:', _group, _v_group, p_entity.entity_text)
 
-                                    # 得到结果
+                                        # 得到结果
                                     if _flag:
+                                        if _label in [2, 3, 4]:
+                                            notfound_tenderer = False
                                         p_entity.label = _label
                                         p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                         # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                         break
+                                    if _i_span == 0 and  re.search(self.condadate_left, list_spans[_i_span]):
+                                        candidates.append(p_entity)
+
+                    elif str(p_entity.label) in ['2', '3', '4']:
+                        notfound_tenderer = False
 
                 # 其他金额通过正则召回可能是招标或中投标的金额
                 if p_entity.entity_type in ["money"]:
@@ -1509,6 +1568,14 @@ class RoleRulePredictor():
                                     p_entity.label = 0
                                     # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
 
+            if notfound_tenderer and len(candidates) == 1 and re.search(
+                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
+                    article.content[:100]):
+                for p_entity in candidates:
+                    # print('只有一个候选人的作为中标人', p_entity.entity_text)
+                    p_entity.label = 2
+                    p_entity.values[2] = on_value
+
             # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
             list_p = []
             state = 0
@@ -2860,7 +2927,9 @@ class ProductAttributesPredictor():
                             if re.search('^\w{1,4}$', tds[id2_2]):
                                 quantity_unit = tds[id2_2]
                         if id3 != "":
-                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
+                            if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
+                                unitPrice = tds[id3]
+                            elif re.search('^[\d,.亿万元人民币欧美日金额:()()]+$', tds[id3].strip()):
                                 unitPrice = tds[id3]
                                 # _unitPrice = tds[id3]
                                 # re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
@@ -3254,11 +3323,11 @@ class DocChannel():
           '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
           '公告变更neg': '履约变更内容',
           '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
-          '候选人公示neg': '中标候选人公示期',
+          '候选人公示neg': '中标候选人公示期|中标候选人公示前',
           '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
           '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
           '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
-          '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位',
+          '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果',
       # |确定成交供应商[:,\s]
           '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
           '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
@@ -4831,8 +4900,8 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
-            "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
+            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -4918,7 +4987,7 @@ class TablePremExtractor(object):
         roles = []
         if ners:
             for ner in ners[0]:
-                if ner[2] in ['org', 'company']:
+                if ner[2] in ['org', 'company', 'location']:
                     roles.append(ner[3])
         if roles and len(''.join(roles)) > len(text)*0.8:
             return roles[0]
@@ -4931,6 +5000,8 @@ class TablePremExtractor(object):
         multi_same_package = False # 非连续的重复包号
         package_fix2raw = dict()  # 处理后包号:处理前包号 字典
         link_set = set()
+        not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
+                          'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
@@ -4943,8 +5014,10 @@ class TablePremExtractor(object):
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
             if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
+                # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
                 break
             if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # 内容为空或全部一样 停止匹配
+                # print('内容为空或全部一样 停止匹配')
                 break
             if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
                 project_name = ""
@@ -4962,7 +5035,7 @@ class TablePremExtractor(object):
 
             if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
                 continue
-            if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
+            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
@@ -4977,9 +5050,14 @@ class TablePremExtractor(object):
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
-            if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
-                continue
-            link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
+            if not_package:
+                if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set:
+                    continue
+                link_set.add((project_code, package_code, tenderee, tenderer, budget_, bid_amount_))
+            else:
+                if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
+                    continue
+                link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
             package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
             package = uniform_package_name(package)
@@ -5205,7 +5283,7 @@ class CandidateExtractor(object):
         roles = []
         if ners:
             for ner in ners[0]:
-                if ner[2] in ['org', 'company']:
+                if ner[2] in ['org', 'company', 'location']:
                     roles.append(ner[3])
         if roles and len(''.join(roles)) > len(text)*0.8:
             return roles[0]
@@ -5228,9 +5306,9 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
-            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
                 break
-            if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
+            if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
                 break
 
             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
@@ -5412,6 +5490,7 @@ class CandidateExtractor(object):
 
     def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
         self.nlp_enterprise = nlp_enterprise
+        html = html.replace('比选申请单位', '中标候选人')  # 82347769
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
         if richText: