浏览代码

Merge remote-tracking branch 'origin/master'

luojiehua 2 年之前
父节点
当前提交
d3c086d146

+ 31 - 12
BiddingKG/dl/interface/Entitys.py

@@ -204,6 +204,7 @@ class Entity():
         self.origin_entity_text = ''  # 2022/1/5 新增,记录字典替换的原来的实体名
         self.in_attachment = in_attachment  # 2022/02/10添加,实体是否在附件中
         self.prob = prob  # 2022/06/20添加,实体的概率
+        self.ratio_value = None # 2022/10/18 新增费率处理数据,(value,ratio_type) 费率数值,类型
 
     def set_Role(self,role_label,role_values):
         self.label = int(role_label)
@@ -294,7 +295,7 @@ class Role():
         self.linklist = linklist
         self.money_unit = '' # 2021/8/17 新增 保存金额单位
         # 中投标人属性
-        self.ratio = "" #2022/01/06 新增 保存中投标金额相关费率
+        self.ratio = None #2022/01/06 新增 保存中投标金额相关费率 (ratio_value,ratio_type)
         self.serviceTime = "" #2021/01/06 新增 保存服务期限(工期)
         self.address = ""  #2022/08/08 新增 角色地址
 
@@ -307,17 +308,35 @@ class Role():
         downward_floating_ratio = "" # 下浮率
         discount_ratio = "" # 折扣率/费率
         if self.ratio:
-            num_value = re.search("[\d\.]+",self.ratio).group()
-            num_value = float(num_value)
-            if re.search("%|百分之",self.ratio):
-                num_value = num_value / 100
-            num_value = str('%.4f'%(num_value))
-            if re.search("上浮",self.ratio):
-                floating_ratio = num_value
-            elif re.search("下浮",self.ratio):
-                downward_floating_ratio = num_value
-            else:
-                discount_ratio = num_value
+            # num_value = re.search("\d+(?:\.\d+)?",self.ratio).group()
+            # num_value = float(num_value)
+            # _decimal = str(num_value).split('.')[1]
+            # if _decimal=='0':
+            #     round_len = 0
+            # else:
+            #     round_len = len(_decimal)
+            # if re.search("%|百分之",self.ratio):
+            #     num_value = num_value * 0.01
+            #     round_len += 2
+            # elif re.search("‰|千分之",self.ratio):
+            #     num_value = num_value * 0.001
+            #     round_len += 3
+            # num_value = str(round(num_value,round_len))
+            #
+            # if re.search("上浮",self.ratio):
+            #     floating_ratio = num_value
+            # elif re.search("下浮",self.ratio):
+            #     downward_floating_ratio = num_value
+            # else:
+            #     discount_ratio = num_value
+            ratio_type = self.ratio[1]
+            ratio_value = str(self.ratio[0])
+            if ratio_type=='floating_ratio':
+                floating_ratio = ratio_value
+            elif ratio_type=='downward_floating_ratio':
+                downward_floating_ratio = ratio_value
+            elif ratio_type=='discount_ratio':
+                discount_ratio = ratio_value
         result = {'role_name':self.role_name,'role_text':fitDataByRule(self.entity_text),
                   'role_money': {'money':self.money,'money_unit':self.money_unit,'floating_ratio':floating_ratio,'downward_floating_ratio':downward_floating_ratio,'discount_ratio':discount_ratio},
                   'linklist': self.linklist,'serviceTime':self.serviceTime,'address':self.address}

+ 34 - 22
BiddingKG/dl/interface/Preprocessing.py

@@ -1048,8 +1048,12 @@ def tableToText(soup):
                 _td_len_list.append(len_td)
             if _td_len_list:
                 if len(list(set(_td_len_list))) >= 8 or max(_td_len_list) > 100:
+                    string_list = [re.sub("\s+","",i)for i in tbody.strings if i and i!='\n']
+                    tbody.string = ",".join(string_list)
+                    table_max_len = 30000
+                    tbody.string = tbody.string[:table_max_len]
+                    tbody.name = "turntable"
                     return None
-
         # fixSpan(tbody)
         # inner_table = getTable(tbody)
         # inner_table = fixTable(inner_table)
@@ -1059,7 +1063,8 @@ def tableToText(soup):
         inner_table = fixTable(inner_table)
 
         if inner_table == []:
-            tbody.string = segment(tbody,final=False)
+            string_list = [re.sub("\s+", "", i) for i in tbody.strings if i and i != '\n']
+            tbody.string = ",".join(string_list)
             table_max_len = 30000
             tbody.string = tbody.string[:table_max_len]
             # log('异常表格直接取全文')
@@ -1119,7 +1124,7 @@ def tableToText(soup):
             tag.extract()
     for ul in soup.find_all('ul'): #例子 156439663 多个不同channel 类别的标题
         if ul.find_all('li') == ul.findChildren(recursive=False) and len(set(re.findall(
-                '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|开标评标|资格评审',
+                '招标公告|中标结果公示|中标候选人公示|招标答疑|开标评标|合同履?约?公示|资格评审',
                 ul.get_text(), re.S)))>3:
             ul.extract()
 
@@ -1307,7 +1312,6 @@ def segment(soup,final=True):
     commaList = ["div","br","td","p","li"]
     #commaList = []
     spaceList = ["span"]
-
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')
@@ -2110,7 +2114,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         doc_id = article[0]
         sourceContent = article[1]
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
-
+        sourceContent = re.sub("##attachment##","",sourceContent)
         sourceContent = sourceContent.replace('<br/>', '<br>')
         sourceContent = re.sub("<br>(\s{0,}<br>)+","<br>",sourceContent)
         # for br_match in re.findall("[^>]+?<br>",sourceContent):
@@ -2164,6 +2168,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('招标(建设)单位', '招标单位')  #2022/8/10 修正预测不到表达
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
+        article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -2320,7 +2325,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
                             _text += sentences[idx]
                 _outline.outline_text = _text
                 _outline_summary = re.split("[::,]",_text,1)[0]
-                if len(_outline_summary)<20:
+                if len(_outline_summary)<30:
                     _outline.outline_summary = _outline_summary
                 # print(_outline.outline_index,_outline.outline_text)
 
@@ -2488,16 +2493,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     continue
                 elif entity_type=="person" and len(entity_text)>10 and len(re.findall("[\u4e00-\u9fa5]",entity_text))<len(entity_text)/2:
                     continue
-                # 组织机构实体名称补充
-                if entity_type in ["org", "company"]:
-                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
-                    if fix_name:
-                        if len(fix_name.group(2))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
-                        elif len(fix_name.group(3))>0:
-                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
-                        elif re.search("有限$", entity_text):
-                            entity_text = re.sub("有限$","有限公司",entity_text)
+
 
                 for j in range(len(list_tokenbegin)):
                     if list_tokenbegin[j]==begin_index_temp:
@@ -2516,6 +2512,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 #去掉标点符号
                 entity_text = re.sub("[,,。:!&@$\*]","",entity_text)
                 entity_text = entity_text.replace("(","(").replace(")",")") if isinstance(entity_text,str) else entity_text
+                # 组织机构实体名称补充
+                if entity_type in ["org", "company"]:
+                    fix_name = re.search("(有限)([责贵]?任?)(公?司?)",entity_text)
+                    if fix_name:
+                        if len(fix_name.group(2))>0:
+                            entity_text = entity_text.replace(fix_name.group(), "有限责任公司")
+                        elif len(fix_name.group(3))>0:
+                            entity_text = entity_text.replace(fix_name.group(), "有限公司")
+                        elif re.search("有限$", entity_text):
+                            entity_text = re.sub("有限$","有限公司",entity_text)
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:
@@ -2537,9 +2543,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
+                                  "behind_m":"(()()(?P<money_behind_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -2693,6 +2699,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             break
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
+                    symbol = '-' if entity_text.startswith('-') else ''  # 负值金额前面保留负号
+
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
                     # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
                     if re.search('总投资|投资总额|总预算|总概算|投资规模|批复概算', sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
@@ -2754,6 +2762,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             _exists = True
                     if not _exists:
                         if float(entity_text)>1:
+                            if symbol == '-': # 负值金额保留负号
+                                entity_text = '-'+entity_text
                             list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,begin_index_temp,end_index_temp,in_attachment=in_attachment))
                             list_sentence_entitys[-1].notes = notes  # 2021/7/20 新增金额备注
                             list_sentence_entitys[-1].money_unit = unit  # 2021/7/20 新增金额备注
@@ -2924,9 +2934,11 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                         break
                 entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                 entity_text = ratio['body']
-                list_sentence_entitys.append(
-                    Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
-                           begin_index_temp, end_index_temp,in_attachment=in_attachment))
+                ratio_value = (ratio['value'],ratio['type'])
+                _entity = Entity(doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index,
+                           begin_index_temp, end_index_temp,in_attachment=in_attachment)
+                _entity.ratio_value = ratio_value
+                list_sentence_entitys.append(_entity)
 
             list_sentence_entitys.sort(key=lambda x:x.begin_index)
             list_entitys_temp = list_entitys_temp+list_sentence_entitys

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -254,7 +254,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2022-11-24'}
+    version_date = {'version_date': '2022-12-13'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 166 - 72
BiddingKG/dl/interface/getAttributes.py

@@ -31,6 +31,12 @@ dict_role_id = {"0":"tenderee",
                 "3":"second_tenderer",
                 "4":"third_tenderer"}
 
+role2id_dict = {"tenderee":0,
+                "agency":1,
+                "win_tenderer":2,
+                "second_tenderer":3,
+                "third_tenderer":4}
+
 def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
     '''
     @param:
@@ -1097,7 +1103,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     def addRatioByEntity(packDict,packageName,entity,ratio):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
-                packDict[packageName]["roleList"][i].ratio = ratio.entity_text
+                packDict[packageName]["roleList"][i].ratio = ratio.ratio_value
     def addServiceTimeByEntity(packDict,packageName,entity,serviceTime):
         for i in range(len(packDict[packageName]["roleList"])):
             if packDict[packageName]["roleList"][i].entity_text==entity:
@@ -1119,13 +1125,29 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     p_entity = 0
 
     # 2021/7/19 顺序比较金额,前面是后面的一万倍则把前面金额/10000
-    money_list = [it for it in list_entity if it.entity_type=="money"]
-    for i in range(len(money_list)-1):
-        for j in range(1, len(money_list)):
-            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
-                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
-                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
-                # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
+    # money_list = [it for it in list_entity if it.entity_type=="money"]
+    # for i in range(len(money_list)-1):
+    #     for j in range(1, len(money_list)):
+    #         if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
+    #                 Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
+    #             money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
+    #             # print('连接前修改大于50亿金额:前面是后面的一万倍则把前面金额/10000')
+
+    '''同样金额同时有元及万元单位的,把万元的金额改为元'''
+    wanyuan = []
+    yuan = []
+    for it in list_entity:
+        if it.entity_type == "money" and float(it.entity_text)>5000:
+            if it.money_unit == '万元':
+                wanyuan.append(it)
+            else:
+                yuan.append(it)
+    if wanyuan != [] and yuan != []:
+        for m1 in wanyuan:
+            for m2 in yuan:
+                if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
+                    m1.entity_text = m2.entity_text
+
     
     #遍历所有实体
     # while(p_entity<len(list_entity)):
@@ -1486,7 +1508,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     for one_phone in _phone:
                         PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                         agency_phone.add(one_phone)
-
     # 正则提取电话号码实体
     # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
     phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
@@ -1499,14 +1520,18 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                        '0[1-9]\d{1,2}[-—-―]?[1-9]\d{6}\d?|'
                        '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?\d{7,8}-?\d{,4}|'
+                       '400\d{7}转\d{1,4}|'
                        '[2-9]\d{6,7}')
-    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
+    url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[#$\-_@.&+=\?:/]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
     email_pattern = re.compile("[a-zA-Z0-9][a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@"
                             "[a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*(?:\.[a-zA-Z]{2,})")
     phone_entitys = []
     code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
     for _sentence in list_sentence:
         sentence_text = _sentence.sentence_text
+        # 过长数字串直接过滤替换
+        for _re in re.findall("\d{50,}",sentence_text):
+            sentence_text = sentence_text.replace(_re,"#"*len(_re))
         in_attachment = _sentence.in_attachment
         list_tokenbegin = []
         begin = 0
@@ -1533,6 +1558,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 continue
             res_set.add((i.group(), i.start(), i.end()))
         res_set = sorted(list(res_set),key=lambda x:x[1])
+        # 限制数量,防止异常数据处理时间过长
+        res_set = res_set[:200]
         last_phone_mask = True
         error_numStr_index = []
         sentence_phone_list = []
@@ -1554,7 +1581,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 pass
             else:
                 # 排除“传真号”和其它错误项
-                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
+                if re.search("传,?真|信,?箱|邮,?[箱件]|QQ|qq", phone_left):
                     if not re.search("电,?话", phone_left):
                         error_numStr_index.append(numStr_index)
                         last_phone_mask = False
@@ -1596,6 +1623,20 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             error_numStr_index.append(numStr_index)
                             last_phone_mask = False
                             continue
+                left_context = re.search("[\da-zA-Z\-—-―]+$",sentence_text[:item[1]])
+                if left_context:
+                    if len(left_context.group()) != len("".join(re.findall(phone, left_context.group()))):
+                    # if not re.search("(" + phone.pattern + ")$", left_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
+                right_context = re.search("^[\da-zA-Z\-—-―]+", sentence_text[item[2]:])
+                if right_context:
+                    if len(right_context.group()) != len("".join(re.findall(phone, right_context.group()))):
+                    # if not re.search("^(" + phone.pattern + ")", right_context.group()):
+                        error_numStr_index.append(numStr_index)
+                        last_phone_mask = False
+                        continue
                 # if:上一个phone实体不符合条件
                 if not last_phone_mask:
                     item_start = item[1]
@@ -1771,52 +1812,58 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 break
                 # print(3,combo[0].entity_text,combo[1].entity_text)
 
-        # "公司——地址" 链接规则补充
-        company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
-        company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
-        t_match_list = []
-        for ent_idx in range(len(company_lacation_EntityList)):
-            entity = company_lacation_EntityList[ent_idx]
-            if entity.entity_type in ['company', 'org']:
-                match_nums = 0
-                company_nums = 0  # 经过其他公司的数量
-                location_nums = 0  # 经过电话的数量
-                for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
-                    after_entity = company_lacation_EntityList[after_index]
-                    if after_entity.entity_type == "location":
-                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
-                                tokens_num_dict[entity.sentence_index] + entity.end_index)
-                        location_nums += 1
-                        if distance > 100 or location_nums >= 3:
-                            break
-                        sentence_distance = after_entity.sentence_index - entity.sentence_index
-                        value = (-1 / 2 * (distance ** 2)) / 10000
-                        if sentence_distance == 0:
-                            if distance < 80:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
-                        else:
-                            if distance < 50:
-                                t_match_list.append(Match(entity, after_entity, value))
-                                match_nums += 1
-                                if company_nums:
-                                    break
+    # "公司——地址" 链接规则补充
+    company_lacation_EntityList = [ent for ent in pre_entity if ent.entity_type in ['company', 'org', 'location']]
+    # company_lacation_EntityList = [ent for ent in pre_entity if (ent.entity_type in ['company', 'org'] and ent.label!=5) or ent.entity_type=="location"]
+    company_lacation_EntityList = sorted(company_lacation_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
+    t_match_list = []
+    for ent_idx in range(len(company_lacation_EntityList)):
+        entity = company_lacation_EntityList[ent_idx]
+        if entity.entity_type in ['company', 'org'] and entity.label!=5:
+            match_nums = 0
+            company_nums = 0  # 经过其他公司的数量
+            location_nums = 0  # 经过电话的数量
+            for after_index in range(ent_idx + 1, min(len(company_lacation_EntityList), ent_idx + 5)):
+                after_entity = company_lacation_EntityList[after_index]
+                if after_entity.entity_type == "location":
+                    distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
+                            tokens_num_dict[entity.sentence_index] + entity.end_index)
+                    location_nums += 1
+                    if distance > 100 or location_nums >= 3:
+                        break
+                    sentence_distance = after_entity.sentence_index - entity.sentence_index
+                    value = (-1 / 2 * (distance ** 2)) / 10000
+                    if sentence_distance == 0:
+                        if distance < 80:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
                     else:
-                        # type:company/org
-                        company_nums += 1
-                        if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
-                            break
+                        if distance < 50:
+                            t_match_list.append(Match(entity, after_entity, value))
+                            match_nums += 1
+                            if company_nums:
+                                break
+                else:
+                    # type:company/org
+                    company_nums += 1
+                    if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
+                        break
+                    if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
+                        break
 
-        # km算法分配求解
-        relate_location_result = dispatch(t_match_list)
-        relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
-        for match in relate_location_result:
-            _company = match[0]
-            _relation = match[1]
-            if not _company.pointer_address:
-                _company.pointer_address = _relation
+    # km算法分配求解
+    # for item in t_match_list:
+    #     print("loc_rela",item.main_role.entity_text,item.attribute.entity_text)
+    relate_location_result = dispatch(t_match_list)
+    relate_location_result = sorted(relate_location_result, key=lambda x: (x[0].sentence_index, x[0].begin_index))
+    for match in relate_location_result:
+        _company = match[0]
+        _relation = match[1]
+        # print("loc_rela2", _company.entity_text, _relation.entity_text, )
+        if not _company.pointer_address:
+            _company.pointer_address = _relation
     # "联系人——联系电话" 链接规则补充
     person_phone_EntityList = [ent for ent in pre_entity+ phone_entitys if ent.entity_type not in ['company','org','location']]
     person_phone_EntityList = sorted(person_phone_EntityList, key=lambda x: (x.sentence_index, x.begin_index))
@@ -2018,7 +2065,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                             for _p in person_phone:
                                 if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
                                     PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
-
     re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
     split_list = [0] * 16
     split_dict = {
@@ -2375,7 +2421,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         prepare_link.append(after_entity)
                         last_person = after_entity
                         continue
-
     # 统一同类角色的属性
     for k in PackDict.keys():
         for i in range(len(PackDict[k]["roleList"])):
@@ -2428,6 +2473,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 PackDict[k]["roleList"][i].linklist.remove(_item)
 
     # PackDict更新company/org地址
+    last_role_prob = {}
     for ent in pre_entity:
         if ent.entity_type in ['company','org']:
             if ent.pointer_address:
@@ -2436,9 +2482,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         if PackDict[k]["roleList"][i].entity_text == ent.entity_text:
                             if not PackDict[k]["roleList"][i].address:
                                 PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
                             else:
-                                if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
-                                    PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                if PackDict[k]["roleList"][i].role_name in ['tenderee','agency']:
+                                    # 角色为招标/代理人时,取其实体概率高的链接地址作为角色address
+                                    if ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]] > last_role_prob[PackDict[k]["roleList"][i].role_name]:
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
+                                        last_role_prob[PackDict[k]["roleList"][i].role_name] = ent.values[role2id_dict[PackDict[k]["roleList"][i].role_name]]
+                                else:
+                                    if len(ent.pointer_address.entity_text) > len(PackDict[k]["roleList"][i].address):
+                                        PackDict[k]["roleList"][i].address = ent.pointer_address.entity_text
 
     # 联系人——电子邮箱链接
     temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
@@ -2766,7 +2819,13 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     for _index in range(len(PackageList)):
         if "hit" in PackageList[_index]:
             for _hit in list(PackageList[_index]["hit"]):
-                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
+                if len(_hit.split("-"))==3:
+                    _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
+                # 补充金额前新增负号‘-’导致错误的规则
+                elif len(_hit.split("-"))==4:
+                    _money = float(_hit.split("-")[2]) if _hit.split("-")[0] == "money" else None
+                else:
+                    _money = None
                 if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
                     dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
     #只找到一个中标人和中标金额
@@ -2776,7 +2835,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # print('一个中标人一个金额:', list(set_tenderer_money)[0])
     #找到一个中标人和多个招标金额
     if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
-        _maxMoney = 0
+        _maxMoney = list(set_tenderer_money)[0]
         _sumMoney = 0
         for _m in list(set_tenderer_money):
             _sumMoney += _m
@@ -3033,7 +3092,9 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
         'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
         'time_commencement':[] , #13 开工日期
-        'time_completion': []  # 14 竣工日期
+        'time_completion': [],  # 14 竣工日期
+        'time_listingStart': [],  # 15 挂牌开始日期(挂牌时间)
+        'time_listingEnd': []  # 16 挂牌结束日期、挂牌截止日期
     }
     last_sentence_index = 0
     last_time_type = ""
@@ -3044,22 +3105,49 @@ def getTimeAttributes(list_entity,list_sentence):
         'time_registrationStart':"time_registrationEnd",
         'time_earnestMoneyStart':"time_earnestMoneyEnd",
         'time_commencement':"time_completion",
+        'time_listingStart':"time_listingEnd"
     }
     for entity in time_entitys:
         sentence_text = list_sentence[entity.sentence_index].sentence_text
         entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
+        entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
         entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
         extract_time = my_timeFormat(entity_text)
         if extract_time:
+            # 2022/12/12 新增挂牌时间正则
+            if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
+                if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
+                    if len(extract_time) == 1:
+                        if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
+                            dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_listingStart'
+                        elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
+                            dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
+                            last_time_type = 'time_listingEnd'
+                        elif re.search("挂牌.?(?:时间|日期)",entity_left2):
+                            if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
+                                dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
+                                last_time_type = 'time_listingEnd'
+                            else:
+                                dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                                last_time_type = 'time_listingStart'
+                    else:
+                        dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
+                        dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
+                        last_time_type = ''
+                    last_sentence_index = entity.sentence_index
+                    continue
+
             if re.search("至|到", entity_left):
                 if entity.sentence_index == last_sentence_index:
                     time_type = last_time_index.get(last_time_type)
                     if time_type:
                         dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
                         last_time_type = ""
+                        last_sentence_index = entity.sentence_index
                         continue
             if entity.label!=0:
                 if entity.label==1 and label_prob>0.5:
@@ -3398,30 +3486,36 @@ def update_prem(old_prem, new_prem):
         for k, v in new_prem.items():
             if k == 'Project':
                 if 'Project' in old_prem:
+                    tmp_l = [] # 保存新旧同时包含的角色
                     for d in old_prem['Project']['roleList']:
                         for d2 in v['roleList']:
-                            if d['role_name'] == d2['role_name']:
+                            if d['role_name'] == d2['role_name']: # 同时包含的角色用表格的替换
+                                tmp_l.append(d2)
                                 d['role_text'] = d2['role_text']
-                                d['role_money']['money'] = d2['role_money']['money']
-                                d['role_money']['money_unit'] = d2['role_money']['money_unit']
-                                v['roleList'].remove(d2)
+                                if d2['role_money']['money'] != 0:  # 如果表格提取的金额不为0才替换
+                                    d['role_money']['money'] = d2['role_money']['money']
+                                    d['role_money']['money_unit'] = d2['role_money']['money_unit']
                     for d2 in v['roleList']:
-                        old_prem['Project']['roleList'].append(d2)
+                        if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
+                            old_prem['Project']['roleList'].append(d2)
                 else:
                     old_prem[k] = v
             else:
-                if k not in old_prem:
+                if k not in old_prem:  # 新有旧没有的包直接添加
                     old_prem[k] = v
                 else:
+                    tmp_l = []  # 保存新旧同时包含的角色
                     for d in old_prem[k]['roleList']:
                         for d2 in v['roleList']:
                             if d['role_name'] == d2['role_name']:
+                                tmp_l.append(d2)
                                 d['role_text'] = d2['role_text']
-                                d['role_money']['money'] = d2['role_money']['money']
-                                d['role_money']['money_unit'] = d2['role_money']['money_unit']
-                                v['roleList'].remove(d2)
+                                if d2['role_money']['money'] != 0: # 如果表格提取的金额不为0才替换
+                                    d['role_money']['money'] = d2['role_money']['money']
+                                    d['role_money']['money_unit'] = d2['role_money']['money_unit']
                     for d2 in v['roleList']:
-                        old_prem[k]['roleList'].append(d2)
+                        if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
+                            old_prem[k]['roleList'].append(d2)
 
     # return old_prem
 

+ 82 - 25
BiddingKG/dl/interface/predictor.py

@@ -1493,7 +1493,7 @@ class RoleRuleFinalAdd():
         for ent in list_entitys[0]:
             if ent.entity_type in ['org', 'company']:
                 if ent.label == 0 and ent.values[ent.label]>=0.5:
-                    if '公共资源交易中心' in ent.entity_text:
+                    if '公共资源交易中心' in ent.entity_text:  # 公共资源交易中心不算招标或代理,只算平台
                         ent.label = 5
                         continue
                     tenderee_list.append(ent.entity_text)
@@ -2567,15 +2567,15 @@ class ProductAttributesPredictor():
                 col1_l = []
                 for tds in inner_table:
                     if len(tds) == 2:
-                        col0_l.append(re.sub('[::]', '', tds[0]))
+                        col0_l.append(re.sub('[::]', '', tds[0]))  # 处理只有两列的情况
                         col1_l.append(tds[1])
-                    elif len(tds)>=4 and len(inner_table)==2:
+                    elif len(tds)>=4 and len(inner_table)==2:  # 处理只有两行的情况
                         col0_l = inner_table[0]
                         col1_l = inner_table[1]
                         break
                 # print(set(col0_l))
                 # print('head: ',set(col0_l) & self.header_set)
-                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
+                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2 and len(col0_l)==len(col1_l): # 保证两个列数一致
                     header_list2 = []
                     product = demand = budget = order_begin = order_end = ""
                     for i in range(len(col0_l)):
@@ -2616,7 +2616,7 @@ class ProductAttributesPredictor():
             while i < (len(inner_table)):
                 tds = inner_table[i]
                 not_empty = [it for it in tds if it != ""]
-                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
+                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: # 一半列是空的或者小于两列的 继续
                     i += 1
                     continue
                 product = ""  # 产品
@@ -2634,6 +2634,8 @@ class ProductAttributesPredictor():
                 if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                 # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                     header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
+                    if found_header:
+                        header_colnum = len(tds) # 保存表头所在行列数
                     if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
                             quantity_header = header_list[1].replace('单位:', '')
                             if re.search('(([\w/]{,5}))', quantity_header):
@@ -2644,7 +2646,6 @@ class ProductAttributesPredictor():
                     if found_header and len(headers)<1:  # 只保留出现的第一个表头
                         headers.append('_'.join(header_list))
                         headers_demand.append('_'.join(header_list2))
-                        header_colnum = len(tds)
                         header_col.append('_'.join(tds))
                     i += 1
                     continue
@@ -2662,6 +2663,18 @@ class ProductAttributesPredictor():
                     id6 = header_dic.get('需求', "")
                     id7 = header_dic.get('预算', "")
                     id8 = header_dic.get('时间', "")
+
+                    not_attr = 0
+                    for k, v in header_dic.items():
+                        if isinstance(v, int):
+                            if v >= len(tds) or tds[v] in self.header_set:
+                                not_attr = 1
+                                break
+                    if not_attr: # 只要属性里面有一项为表头,停止匹配
+                        i += 1
+                        found_header = False
+                        continue
+
                     if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                             re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                         product = tds[id1]
@@ -2801,16 +2814,19 @@ class ProductAttributesPredictor():
         list_outline = list_outlines[0]
         get_product_attrs = False
         for _outline in list_outline:
-            if re.search("信息|情况|清单",_outline.outline_summary):
+            if re.search("信息|情况|清单|概况",_outline.outline_summary):
                 outline_text = _outline.outline_text
                 outline_text = outline_text.replace(_outline.outline_summary,"")
                 key_value_list = [_split for _split in re.split("[,。;]",outline_text) if re.search("[::]",_split)]
+                if not key_value_list:
+                    continue
                 head_list = []
                 head_value_list = []
                 for key_value in key_value_list:
                     key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
                     temp = re.split("[::]",key_value)
                     key = temp[-2]
+                    key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
                     value = temp[-1]
                     head_list.append(key)
                     head_value_list.append(value)
@@ -2840,6 +2856,7 @@ class ProductAttributesPredictor():
                             tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
                         product = ""  # 产品
                         quantity = ""  # 数量
+                        quantity_unit = "" # 单位
                         unitPrice = ""  # 单价
                         brand = ""  # 品牌
                         specs = ""  # 规格
@@ -2857,6 +2874,7 @@ class ProductAttributesPredictor():
                             # print('header_dic: ',header_dic)
                             id1 = header_dic.get('名称', "")
                             id2 = header_dic.get('数量', "")
+                            id2_2 = header_dic.get('单位', "")
                             id3 = header_dic.get('单价', "")
                             id4 = header_dic.get('品牌', "")
                             id5 = header_dic.get('规格', "")
@@ -2870,8 +2888,25 @@ class ProductAttributesPredictor():
                                 if id2 != "":
                                     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                         quantity = deal_list[id2]
+                                        quantity = re.sub('[()(),,约]', '', quantity)
+                                        quantity = re.sub('[一壹]', '1', quantity)
+                                        ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
+                                        if ser:
+                                            quantity = str(ser.group(1))
+                                            quantity_unit = ser.group(2)
+                                        else:
+                                            quantity = ""
+                                            quantity_unit = ""
+                                if id2_2 != "":
+                                    if re.search('^\w{1,4}$', deal_list[id2_2]):
+                                        quantity_unit = deal_list[id2_2]
                                     else:
-                                        quantity = ""
+                                        quantity_unit = ""
+                                # if id2 != "":
+                                #     if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
+                                #         quantity = deal_list[id2]
+                                #     else:
+                                #         quantity = ""
                                 if id3 != "":
                                     if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
                                         _unitPrice = deal_list[id3]
@@ -2912,7 +2947,7 @@ class ProductAttributesPredictor():
                                         order_begin, order_end = self.fix_time(order_time, html, page_time)
                                 # print(quantity,unitPrice,brand,specs)
                                 if quantity != "" or unitPrice != "" or brand != "" or specs != "":
-                                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
+                                    link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
                                             'brand': brand[:50], 'specs': specs}
                                     if link not in product_link:
                                         product_link.append(link)
@@ -4266,6 +4301,7 @@ class DistrictPredictor():
             self.full_name = full_name
             self.short2id = short2id
             self.full2id = full2id
+        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
 
     def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
         '''
@@ -4330,7 +4366,7 @@ class DistrictPredictor():
                                     if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
                                         type_w = 2
                                     else:
-                                        type_w = 1
+                                        type_w = 0.5
                                 id_set.add(_id)
                                 score2 += w * type_w
                             score_l.append([_id, score * w + score2] + area)
@@ -4409,17 +4445,14 @@ class DistrictPredictor():
 
         def get_all_addr(list_entitys):
             tenderee_l = []
-            other_roles = []
             addr_l = []
             for ent in list_entitys[0]:
-                if ent.entity_type == 'location':
+                if ent.entity_type == 'location' and len(ent.entity_text)>2:
                     addr_l.append(ent.entity_text)
                 elif ent.entity_type in ['org', 'company']:
-                    if ent.label == 0:
+                    if ent.label in [0, 1]:  # 加招标或代理
                         tenderee_l.append(ent.entity_text)
-                    else:
-                        other_roles.append(ent.entity_text)
-            return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
+            return ' '.join(addr_l), ' '.join(tenderee_l)
 
         def get_title_addr(text):
             p1 = '(\w{2,8}[省市州区县][^\w]*)+'
@@ -4436,21 +4469,26 @@ class DistrictPredictor():
             content = list_articles[0].content
 
         tenderee, tenderee_address = get_ree_addr(prem)
+        msc = ""
         pro_addr = get_project_addr(content)
         if pro_addr != "":
+            msc += '使用规则提取的项目地址;'
             tenderee_address = pro_addr
         else:
             role_addr = get_role_address(content)
             if role_addr != "":
+                msc += '使用规则提取的联系人地址;'
                 tenderee_address = role_addr
 
         if tenderee_address == "":
             title_addr = get_title_addr(title)
             if title_addr != "":
+                msc += '使用规则提取的标题地址;'
                 tenderee_address = title_addr
             else:
                 bid_addr = get_bid_addr(content)
                 if bid_addr != "":
+                    msc += '使用规则提取的开标地址;'
                     tenderee_address = bid_addr
 
         project_name = str(project_name)
@@ -4466,24 +4504,31 @@ class DistrictPredictor():
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  #预防提取错 合肥 路南 新会 等地区
         # print('text1:', text1)
+        msc += '## 第一次预测输入:%s ##;'%text1
         rs = get_area(text1, web_source_name)
-
+        msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
+        rs['district']['province'], rs['district']['city'], rs['district']['district'])
+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
+        # print('地区匹配:', msc)
         if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
-            all_addr, tenderees, other_roles = get_all_addr(list_entitys)
-            if tenderees != "":
-                text2 = tenderees + " " + all_addr
-                # print('所有地址:', all_addr)
-            else:
-                text2 = other_roles + " " + all_addr
-                # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
+            msc = ""
+            all_addr, tenderees = get_all_addr(list_entitys)
+            text2 = tenderees + " " + all_addr + ' ' + title
+            msc += '使用实体列表所有招标人+所有地址;'
+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
             # print('text2:', text2)
+            msc += '## 第二次预测输入:%s ##'%text2
             rs2 = get_area(text2, web_source_name, not_in_content=False)
             rs2['district']['is_in_text'] = True
             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
                 rs = rs2
             elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
                 rs = rs2
+            msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
+                rs['district']['province'],rs['district']['city'],rs['district']['district'])
+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
+        # print('地区匹配:', msc)
         return rs
 
 class TableTag2List():
@@ -4664,6 +4709,13 @@ class TablePremExtractor(object):
             bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
+                break
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) - set(['', ' ']) == set():  # 全部为空 停止匹配
+                break
+
+            if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
+                project_name = ""
             if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
                 package_code_raw = project_name
                 project_name = ""
@@ -4797,7 +4849,7 @@ class TablePremExtractor(object):
                             else:
                                 table_items.append(trs[j])
                         else:
-                            print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
+                            # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                             break
                     if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
@@ -4915,6 +4967,11 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+                break
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' ']) == set():  # 全部为空 停止匹配
+                break
+
             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
                 col_indx = headers['candidate'][0] -1
                 pre_col = df.loc[i, col_indx]

+ 184 - 12
BiddingKG/dl/ratio/re_ratio.py

@@ -1,14 +1,16 @@
 import re
-
+from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?'
-                   '|[0-9]+\.?[0-9]{0,3}[((]?%?[))]?[((]?(费率|折扣率|(上浮|下浮)费?率)[))]?)')
-ratio = ratio.pattern
 
+# Ratio-mention pattern. It has two top-level alternatives:
+#   1. keyword first: 上浮/下浮 (optionally followed by 费, 率, 报价), or
+#      (中标/投标/报价/总价)费率, 折扣率, 优惠率; then an optional bracketed
+#      %/‰ marker, optional 报价/取值, an optional bracketed note of 1-20
+#      characters, up to three separator characters, and finally the value:
+#      a 1-2 digit number with optional decimals and optional %/‰, or
+#      百分之/千分之 followed by Chinese numerals (optionally with 点 decimals);
+#   2. value first: a 1-2 digit number with optional decimals and a mandatory
+#      %/‰, then up to two Chinese characters and one of the rate keywords.
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+                   '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
+                   '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
+# Only the pattern string is kept; the compiled object is discarded here.
+ratio = ratio.pattern
+# print(ratio)
 
-# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
-# 基准利率上浮率:百分之三十(30%)
+# 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%, X
+# 基准利率上浮率:百分之三十(30%) X
 # 租金上浮率
 # 上浮率活期20%
 # 上浮率:活期20%、一年定期35%
@@ -25,7 +27,12 @@ def re_standard_ratio(_str):
             m_span = m.span()
             keyword_index = [m_span[0], m_span[1]]
             keyword = m_dict.get("value")
-            ratio_list.append([keyword, keyword_index])
+            left = _str[max(0,m_span[0]-15):m_span[0]]
+            right = _str[m_span[1]:m_span[1]+10]
+            context = left + keyword + right
+            # print(1,keyword)
+            if not re.search("利率|保险",context) and not re.search("^[万元]",right):
+                ratio_list.append([keyword, keyword_index])
 
     return ratio_list
 
@@ -39,20 +46,185 @@ def re_ratio(text):
 def extract_ratio(text):
+    # Find ratio mentions in `text` and normalise each one to a numeric value
+    # plus a ratio type.
+    # Returns a list of dicts with keys "body", "begin_index", "end_index",
+    # "value" and "type"; mentions whose normalised value exceeds 1 are dropped.
+    # NOTE(review): re_ratio is defined outside this hunk; from the loop below
+    # it presumably yields (matched_text, [begin, end]) pairs - confirm.
     result_list = []
     total_money_list = re_ratio(text)
+    # print(total_money_list)
     if total_money_list:
         for word, text_index in total_money_list:
-            d = {"body": word, "begin_index": text_index[0],
-                 "end_index": text_index[1]}
-            result_list.append(d)
+            # Pull the numeric part out of the match: either an Arabic number
+            # (optionally followed by a bracket and %/‰) or a Chinese numeral
+            # that is not immediately followed by 分之.
+            num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十][零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]*(?:点[零壹贰叁肆伍陆柒捌玖一二三四五六七八九]+)?(?!分之)", word)
+            if num_value:
+                num_value = num_value.group()
+            else:
+                continue
+            # Branch 1: the numeric part is written with Chinese numerals.
+            if re.search("[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]",num_value):
+                if '点' in num_value:
+                    # 点 is the decimal point: convert the integer part with
+                    # getUnifyNum, then add each fractional digit by position.
+                    num_split = num_value.split("点")
+                    round_len = len(num_split[1])
+                    num_integer = num_split[0]
+                    # A leading unit such as 十 gets an explicit 壹 in front
+                    # before the integer part is converted.
+                    if re.search("^[十拾佰百]",num_integer):
+                        num_integer = "壹" + num_integer
+                    num_value = getUnifyNum(num_integer)
+                    for index,num_word in enumerate(list(num_split[1])):
+                        num_value = float(num_value) + getDigitsDic(num_word) * 0.1**(index+1)
+                else:
+                    round_len = 0
+                    num_value = getUnifyNum(num_value)
+                    num_value = float(num_value)
+                # Scale by the marker found anywhere in the whole match, and
+                # widen the rounding precision by the same number of places.
+                if re.search("%|百分之", word):
+                    num_value = num_value / 100
+                    round_len += 2
+                elif re.search("‰|千分之", word):
+                    num_value = num_value / 1000
+                    round_len += 3
+            # Branch 2: the numeric part is written with Arabic digits.
+            else:
+                match_text = num_value
+                # print(num_value)
+                # Strip the bracket and %/‰, then parse as Decimal rounded to
+                # ten decimal places.
+                num_value = round(Decimal(re.sub('[((]|[%‰]','',num_value)),10)
+                # print(num_value)
+                # _num = str(num_value).split('.')[0]
+                # Skip values whose string form has no decimal point.
+                if len(str(num_value).split('.'))<2:
+                    continue
+                # Count the significant fractional digits (trailing zeros
+                # removed) to decide how many places to keep when rounding.
+                _decimal = str(num_value).split('.')[1]
+                _decimal = re.sub("0+$","",_decimal)
+                # print(_decimal)
+                if _decimal=="":
+                    _decimal = "0"
+                # num_value = float(_num+"."+_decimal)
+                # print(num_value)
+                if _decimal == '0':
+                    round_len = 0
+                else:
+                    round_len = len(_decimal)
+                # A value below 1 written without % or ‰ is left unscaled.
+                if num_value<1 and not re.search('[%‰]',match_text):
+                    pass
+
+                else:
+                    # Otherwise scale by the marker found in the whole match;
+                    # with no marker at all the value is still divided by 100.
+                    if re.search("%|百分之",word):
+                        num_value = num_value / 100
+                        round_len += 2
+                    elif re.search("‰|千分之",word):
+                        num_value = num_value / 1000
+                        round_len += 3
+                    else:
+                        num_value = num_value / 100
+                        round_len += 2
+
+            num_value = round(num_value, round_len)
+            # print(word,num_value)
+            # Classify by keyword in the matched text: 上浮 -> floating_ratio,
+            # 下浮/优惠 -> downward_floating_ratio, 折扣 -> discount_ratio when
+            # the value is above 0.5 and downward_floating_ratio otherwise,
+            # anything else -> discount_ratio.
+            if re.search("上浮",word):
+                ratio_type = 'floating_ratio'
+            elif re.search("下浮|优惠",word):
+                ratio_type = 'downward_floating_ratio'
+            elif re.search("折扣",word):
+                if num_value>0.5:
+                    ratio_type = 'discount_ratio'
+                else:
+                    ratio_type = 'downward_floating_ratio'
+            else:
+                ratio_type = 'discount_ratio'
+            # Only values that normalise to at most 1 are reported.
+            if num_value<=1:
+                d = {"body": word, "begin_index": text_index[0],
+                     "end_index": text_index[1],"value":num_value,"type":ratio_type}
+                result_list.append(d)
     return result_list
 
 
+def getDigitsDic(unit):
+    '''
+    @summary: Look up the integer value of a single Chinese numeral character.
+    @param:
+        unit: one character, a formal numeral (零壹贰叁肆伍陆柒捌玖) or an
+              everyday numeral (〇一二三四五六七八九)
+    @return: int between 0 and 9, or None when the character is not in the table
+    '''
+    # First row: formal numerals; second row: everyday numerals.
+    DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
+                 "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
+    # dict.get: an unknown character yields None instead of raising KeyError.
+    return DigitsDic.get(unit)
+
+
+def getMultipleFactor(unit):
+    '''
+    @summary: Look up the multiplier that a Chinese unit character stands for.
+    @param:
+        unit: one unit character (兆 亿 万 仟/千 佰/百 拾/十 元/圆 角 分)
+    @return: Decimal multiplier, or None when the character is not in the table
+    '''
+    # 角 and 分 are built from float literals, which are not exact in binary;
+    # rounding to 1 and 2 places turns them into exact Decimal 0.1 and 0.01.
+    MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000),
+                      "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10),
+                      "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)}
+    # dict.get: an unknown unit yields None instead of raising KeyError.
+    return MultipleFactor.get(unit)
+
+
+def getUnifyNum(money):
+    '''
+    @summary: Convert an amount string written with Chinese numerals and units
+              (or plain Arabic digits) into a number.
+    @param:
+        money: amount string; commas and every character outside the accepted
+               digit/unit set are stripped before parsing
+    @return: Decimal amount. A lone Chinese digit is returned as the plain int
+             from getDigitsDic. Decimal(0) is returned when any exception is
+             raised, or when the overflow guard below triggers.
+    '''
+
+    MAX_MONEY = 1000000000000
+    # MAX_NUM is not referenced anywhere else in this function.
+    MAX_NUM = 12
+    # Remove commas (full-width and ASCII).
+    money = re.sub("[,,]", "", money)
+    # Drop every character that is not a digit, a dot, a Chinese numeral or a unit.
+    money = re.sub("[^0-9.一二三四五六七八九零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money)
+    result = Decimal(0)
+    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"]
+    # The order of this list decides which unit the string is split on first.
+    chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
+
+    # Plain Arabic number, optionally with a fractional part.
+    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
+    # A single Chinese digit, optionally preceded by 零.
+    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$" % ("".join(chnDigits)))
+    try:
+        if re.search(LowMoneypattern, money) is not None:
+            return Decimal(money)
+        elif re.search(BigMoneypattern, money) is not None:
+            return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney"))
+        for factorUnit in chnFactorUnits:
+            if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None:
+                # Split at the last occurrence of the unit (the lookahead rejects
+                # any occurrence that is followed by the same unit again):
+                # subMoneys[0] is the multiplier part, subMoneys[1] the remainder.
+                subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money)
+                if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None:
+                    # Overflow guard: give up when multiplier * unit would exceed MAX_MONEY.
+                    if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]):
+                        return Decimal(0)
+                    result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit))
+                elif len(subMoneys[0]) == 1:
+                    # A single character counts only when it is a Chinese digit.
+                    if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None:
+                        result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                # subMoneys[0] holds no amount unit and cannot be split further
+                elif subMoneys[0] == "":
+                    result += 0
+                elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
+                    # print(subMoneys)
+                    # subMoneys[0] = subMoneys[0][0]
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                else:
+                    # The multiplier part still contains units: convert it recursively.
+                    result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit))
+                if len(subMoneys) > 1:
+                    # NOTE(review): this pattern also admits a trailing 百/千/万/亿
+                    # or 元, which Decimal() cannot parse; such input would raise
+                    # and fall into the except below, returning Decimal(0) - confirm
+                    # whether that is intended.
+                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None:
+                        result += Decimal(subMoneys[1])
+                    elif len(subMoneys[1]) == 1:
+                        if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None:
+                            result += Decimal(getDigitsDic(subMoneys[1]))
+                    else:
+                        # The remainder is converted recursively and added as is.
+                        result += Decimal(getUnifyNum(subMoneys[1]))
+                # Only the first unit found (in chnFactorUnits order) is handled
+                # at this level; the recursion covers the rest.
+                break
+    except Exception as e:
+        # Any failure while parsing is swallowed and reported as zero.
+        # traceback.print_exc()
+        return Decimal(0)
+    return result
+
+
 def test_str():
+    # Manual smoke test: prints what extract_ratio returns for sample text.
+    # `s` is reassigned several times; only the last assignment is used.
     s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
     s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \
         ' 上浮率....  上浮率30(%)  (下浮率%):43  下浮率报价0.5%'
-    s = '费率或单价等:报价:94.00%, 幕墙工程费率为25.08%, 投标成本警戒费率为90%, 下浮率3.15%'
-
+    # One sample phrase per line of the multi-line string below.
+    s = '''费率%)61.20万
+费率(精确到小数点后两位)60.00%
+下浮率取值13%
+下浮率报价13%
+下浮率 百分之十点零陆(10.00%
+下浮率 大写:无 下浮率百分之贰拾陆 无 小写: 下浮26%
+下浮率% 30
+成交优惠率% 5.00
+下浮率 0.25
+下浮率 0.25%
+中标金额:57.75%(商业优惠率)
+费率):1800
+费率):12
+折扣率(%):99.2063
+投标报价:96.00%(折扣率
+'''
+    # s = '下浮率 百分之十点零陆(10.00%'
     print(extract_ratio(s))
 
 

文件差异内容过多而无法显示
+ 1 - 4
BiddingKG/dl/table_head/predict.py


+ 1 - 1
BiddingKG/dl_dev/test/test4.py

@@ -105,7 +105,7 @@ def run_one():
     # '''
     print("start")
     _time1 = time.time()
-    print(predict("12", text,"打印机"))
+    print(predict("12", text,""))
     # test(12,content)
     # test(12,text)
     print("takes",time.time()-a)

部分文件因为文件数量过多而无法显示