소스 검색

字典匹配规则优化;金额及角色规则优化;数据源唯一招标人更新

lsm 1 년 전
부모
커밋
8e9c4bf478

+ 7 - 2
BiddingKG/dl/entityLink/entityLink.py

@@ -114,7 +114,7 @@ def get_business_data(enterprise_name):
                     d = json.loads(_v)
                     d = json.loads(_v)
                     if d.get('have_business', '') == 1:
                     if d.get('have_business', '') == 1:
                         return True, d
                         return True, d
-                    return False, {}
+                    return False, d
                 else:
                 else:
                     return False, {}
                     return False, {}
         except Exception as e:
         except Exception as e:
@@ -332,6 +332,7 @@ SET_PREFIX_ENTERPRISE = set()
 SET_TAIL_ENTERPRISE = set()
 SET_TAIL_ENTERPRISE = set()
 SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
 SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
 SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
 SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
+
 def getDict_enterprise():
 def getDict_enterprise():
     global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
     global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
     real_path,is_huge = getEnterprisePath()
     real_path,is_huge = getEnterprisePath()
@@ -507,7 +508,9 @@ def match_enterprise_max_first(sentence):
                     for _i in range(_len):
                     for _i in range(_len):
                         enter_name = sentence[begin_index:begin_index+_len-_i]
                         enter_name = sentence[begin_index:begin_index+_len-_i]
                         enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
                         enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
-                        if enter_tail in SET_TAIL_ENTERPRISE:
+                        if re.search('[\u4e00-\u9fa5]', enter_tail) == None: # 20240111不包含中文后缀不要
+                            continue
+                        if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
                             if is_enterprise_exist(enter_name):
                             if is_enterprise_exist(enter_name):
                                 match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
                                 match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
                                 # print("match_item",key_enter,enter_name)
                                 # print("match_item",key_enter,enter_name)
@@ -567,6 +570,8 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                         find_flag = True
                         find_flag = True
                         #判断是否是多个公司
                         #判断是否是多个公司
+                        if re.search('[分支](公司|中心|部|行)', p_entity.entity_text):
+                            continue
                         for _match_j in range(_match_index,len(list_match)):
                         for _match_j in range(_match_index,len(list_match)):
                             if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                             if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                 _match_j -= 1
                                 _match_j -= 1

+ 3 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -2525,8 +2525,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     # 使用正则识别金额
     # 使用正则识别金额
     entity_type = "money"
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|资金|(控制|拦标)价)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                          "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
 
@@ -2624,7 +2624,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                 elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
                 elif re.search('^[-—]+[\d,.]+万元', sentence_text[end_index:]):
                     # print('两个金额连接后面的有单位,用后面单位')
                     # print('两个金额连接后面的有单位,用后面单位')
                     unit = '万元'
                     unit = '万元'
-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
                         unit = '万元'
                         # print('金额较小且句子中有万元的,补充单位为万元')
                         # print('金额较小且句子中有万元的,补充单位为万元')

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-12-18'}
+    version_date = {'version_date': '2024-01-15'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
 
     '''最终检查修正招标、中标金额'''
     '''最终检查修正招标、中标金额'''

+ 1 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -96,6 +96,7 @@ class Model_role_classify_word():
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
         text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
         text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、', '', text)
+        text = re.sub('(采购|招标|发布)机构', '发布人', text)
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 23 - 28
BiddingKG/dl/interface/predictor.py

@@ -1417,7 +1417,7 @@ class RoleRulePredictor():
 
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?为\w{2,4}资金|服务金额|采购成本价")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_money_other = re.compile("代理费|服务费")
@@ -1476,6 +1476,8 @@ class RoleRulePredictor():
             _label = 5
             _label = 5
         elif _label == 2 and re.search('为$', before) and re.match('\w', after):  # 排除错误 前文为结尾,后文不是标点符号结尾的,如 353824459 供应商为社会团体的,  供应商为玉田县中医医院提供安保服务
         elif _label == 2 and re.search('为$', before) and re.match('\w', after):  # 排除错误 前文为结尾,后文不是标点符号结尾的,如 353824459 供应商为社会团体的,  供应商为玉田县中医医院提供安保服务
             _label = 5
             _label = 5
+        elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
+            _label = 5
         if _label == 5:
         if _label == 5:
             _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # 前后文匹配
             _label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # 前后文匹配
             keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
             keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
@@ -1720,7 +1722,7 @@ class RoleRulePredictor():
                                 if re.search('(含|在|包括)(\d+)?$', _span[0]):
                                 if re.search('(含|在|包括)(\d+)?$', _span[0]):
                                     continue
                                     continue
                                 if re.search(',\w{2,}', _span[0]):
                                 if re.search(',\w{2,}', _span[0]):
-                                    _span[0] = _span[0].split(',')[-1]  #避免多个价格在一起造成误判
+                                    _span[0] = _span[0].split(',')[-1] if len(_span[0].split(',')[-1])>4 else _span[0][-8:] #避免多个价格在一起造成误判
                                 if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                 if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                         self.pattern_money_other, _span[0]) is None:
                                         self.pattern_money_other, _span[0]) is None:
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
@@ -1822,7 +1824,7 @@ class RoleRuleFinalAdd():
         text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
         text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
         text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
         text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
         # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
         sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
@@ -1858,7 +1860,7 @@ class RoleRuleFinalAdd():
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
                 ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
 
 
                 if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
                 if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
-                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
+                                                  or re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) == None) \
                         and ent_re not in agency_list and ent_re not in agency_set:
                         and ent_re not in agency_list and ent_re not in agency_set:
                     n = 0
                     n = 0
                     for i in range(len(ents) - 1, -1, -1):
                     for i in range(len(ents) - 1, -1, -1):
@@ -5572,8 +5574,8 @@ class TablePremExtractor(object):
             # tenderee = tenderee if self.is_role(tenderee) else ""
             # tenderee = tenderee if self.is_role(tenderee) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
 
 
-            tenderee = self.get_role(tenderee, self.nlp_enterprise)
-            tenderer = self.get_role(tenderer, self.nlp_enterprise)
+            tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
+            tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
 
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
                 break
@@ -5608,7 +5610,7 @@ class TablePremExtractor(object):
             prem_dic[package]['name'] = project_name
             prem_dic[package]['name'] = project_name
 
 
             if budget_ != "":
             if budget_ != "":
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
                 budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
@@ -5638,7 +5640,7 @@ class TablePremExtractor(object):
                         "serviceTime": ""
                         "serviceTime": ""
                 })
                 })
             if tenderer and not same_package:
             if tenderer and not same_package:
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
                     break
 
 
@@ -5914,7 +5916,7 @@ class CandidateExtractor(object):
                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
                                            [win_tenderer, second_tenderer, third_tenderer]):
-                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '',
+                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
                                       text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                                       text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                             break
                             break
                         money, money_unit = money_process(text, header)
                         money, money_unit = money_process(text, header)
@@ -5949,7 +5951,7 @@ class CandidateExtractor(object):
                             'tendereeMoney': 0,
                             'tendereeMoney': 0,
                             'tendereeMoneyUnit': ""
                             'tendereeMoneyUnit': ""
                         }
                         }
-                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
                         break
                         break
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
 
 
@@ -6499,24 +6501,17 @@ if __name__=="__main__":
     rs = product_attr.predict(docid='', html=html, page_time="")
     rs = product_attr.predict(docid='', html=html, page_time="")
     print(rs)
     print(rs)
 
 
-    # docid = ""
-    # title = ''
-    # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
-    #     html = f.read()
-    # tb_extract = TablePremExtractor()
-    # rs = tb_extract.predict(html, [
-    #     "河钢集团供应链管理有限公司邯郸分公司",
-    #     "石家庄中达科技有限公司",
-    #     "河北骥驰耐磨材料有限公司",
-    #     "衡水奥诺工矿机械设备有限公司",
-    #     "河北勤鹏机械设备科技有限公司",
-    #     "邯郸市华北不锈钢厂有限公司",
-    #     "邯郸市芳林机械备件制造有限公司",
-    #     "济南宏鲁新型材料有限公司",
-    #     "邯郸海博机械设备有限公司",
-    #     "河北万革新能源科技有限公司"
-    # ])
-    # print(rs)
+    docid = ""
+    title = ''
+    with open('d:/html/2.html', 'r', encoding='utf-8') as f:
+        html = f.read()
+    tb_extract = TablePremExtractor()
+    rs = tb_extract.predict(html, [
+        "广东省广裕集团嘉顺实业有限责任公司",
+        "广州顺为招标采购有限公司",
+        "中华人民共和国"
+    ])
+    print(rs)
 
 
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]

파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
+ 0 - 0
BiddingKG/dl/interface/websource_tenderee.pkl


이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.