Bladeren bron

Merge remote-tracking branch 'origin/master'

fangjiasheng 11 maanden geleden
bovenliggende
commit
d130e695aa
100 gewijzigde bestanden met 144 toevoegingen en 79 verwijderingen
  1. 1 1
      BiddingKG/dl/common/Utils.py
  2. 15 3
      BiddingKG/dl/entityLink/entityLink.py
  3. 10 3
      BiddingKG/dl/interface/Preprocessing.py
  4. 2 5
      BiddingKG/dl/interface/extract.py
  5. 3 2
      BiddingKG/dl/interface/getAttributes.py
  6. 1 1
      BiddingKG/dl/interface/modelFactory.py
  7. 112 64
      BiddingKG/dl/interface/predictor.py
  8. 0 0
      BiddingKG/dl/role/log/ep001-loss0.961-val_loss0.612-f10.713.h5
  9. 0 0
      BiddingKG/dl/role/log/ep002-loss0.603-val_loss0.459-f10.796.h5
  10. 0 0
      BiddingKG/dl/role/log/ep003-loss0.297-val_loss0.243-f10.918.h5
  11. 0 0
      BiddingKG/dl/role/log/ep003-loss0.494-val_loss0.415-f10.812.h5
  12. 0 0
      BiddingKG/dl/role/log/ep004-loss0.280-val_loss0.231-f10.920.h5
  13. 0 0
      BiddingKG/dl/role/log/ep004-loss0.448-val_loss0.386-f10.827.h5
  14. 0 0
      BiddingKG/dl/role/log/ep005-loss0.268-val_loss0.223-f10.921.h5
  15. 0 0
      BiddingKG/dl/role/log/ep005-loss0.410-val_loss0.337-f10.864.h5
  16. 0 0
      BiddingKG/dl/role/log/ep006-loss0.260-val_loss0.215-f10.923.h5
  17. 0 0
      BiddingKG/dl/role/log/ep006-loss0.363-val_loss0.303-f10.886.h5
  18. 0 0
      BiddingKG/dl/role/log/ep007-loss0.252-val_loss0.209-f10.924.h5
  19. 0 0
      BiddingKG/dl/role/log/ep007-loss0.307-val_loss0.225-f10.924.h5
  20. 0 0
      BiddingKG/dl/role/log/ep008-loss0.101-val_loss0.175-f1_score0.942.h5
  21. 0 0
      BiddingKG/dl/role/log/ep008-loss0.118-val_loss0.171-f1_score0.937.h5
  22. 0 0
      BiddingKG/dl/role/log/ep008-loss0.243-val_loss0.204-f10.925.h5
  23. 0 0
      BiddingKG/dl/role/log/ep008-loss0.263-val_loss0.205-f10.929.h5
  24. 0 0
      BiddingKG/dl/role/log/ep009-loss0.110-val_loss0.172-f1_score0.941.h5
  25. 0 0
      BiddingKG/dl/role/log/ep009-loss0.240-val_loss0.199-f10.926.h5
  26. 0 0
      BiddingKG/dl/role/log/ep009-loss0.243-val_loss0.199-f10.930.h5
  27. 0 0
      BiddingKG/dl/role/log/ep010-loss0.234-val_loss0.196-f10.926.h5
  28. 0 0
      BiddingKG/dl/role/log/ep011-loss0.217-val_loss0.184-f10.938.h5
  29. 0 0
      BiddingKG/dl/role/log/ep011-loss0.231-val_loss0.193-f10.927.h5
  30. 0 0
      BiddingKG/dl/role/log/ep012-loss0.206-val_loss0.174-f10.939.h5
  31. 0 0
      BiddingKG/dl/role/log/ep012-loss0.227-val_loss0.190-f10.928.h5
  32. 0 0
      BiddingKG/dl/role/log/ep013-loss0.224-val_loss0.188-f10.929.h5
  33. 0 0
      BiddingKG/dl/role/log/ep014-loss0.189-val_loss0.164-f10.944.h5
  34. 0 0
      BiddingKG/dl/role/log/ep014-loss0.220-val_loss0.185-f10.930.h5
  35. 0 0
      BiddingKG/dl/role/log/ep015-loss0.180-val_loss0.161-f10.942.h5
  36. 0 0
      BiddingKG/dl/role/log/ep015-loss0.216-val_loss0.184-f10.931.h5
  37. 0 0
      BiddingKG/dl/role/log/ep016-loss0.215-val_loss0.181-f10.931.h5
  38. 0 0
      BiddingKG/dl/role/log/ep017-loss0.175-val_loss0.157-f10.945.h5
  39. 0 0
      BiddingKG/dl/role/log/ep017-loss0.210-val_loss0.180-f10.932.h5
  40. 0 0
      BiddingKG/dl/role/log/ep018-loss0.172-val_loss0.150-f10.945.h5
  41. 0 0
      BiddingKG/dl/role/log/ep018-loss0.209-val_loss0.179-f10.932.h5
  42. 0 0
      BiddingKG/dl/role/log/ep019-loss0.208-val_loss0.177-f10.933.h5
  43. 0 0
      BiddingKG/dl/role/log/ep020-loss0.206-val_loss0.176-f10.934.h5
  44. 0 0
      BiddingKG/dl/role/log/ep021-loss0.159-val_loss0.148-f10.946.h5
  45. 0 0
      BiddingKG/dl/role/log/ep021-loss0.204-val_loss0.175-f10.936.h5
  46. 0 0
      BiddingKG/dl/role/log/ep022-loss0.200-val_loss0.173-f10.937.h5
  47. 0 0
      BiddingKG/dl/role/log/ep022-loss0.579-val_loss0.548-f1_score0.805.h5
  48. 0 0
      BiddingKG/dl/role/log/ep023-loss0.199-val_loss0.173-f10.937.h5
  49. 0 0
      BiddingKG/dl/role/log/ep024-loss0.156-val_loss0.146-f10.947.h5
  50. 0 0
      BiddingKG/dl/role/log/ep024-loss0.198-val_loss0.171-f10.938.h5
  51. 0 0
      BiddingKG/dl/role/log/ep025-loss0.149-val_loss0.146-f10.948.h5
  52. 0 0
      BiddingKG/dl/role/log/ep025-loss0.197-val_loss0.170-f10.938.h5
  53. 0 0
      BiddingKG/dl/role/log/ep026-loss0.149-val_loss0.143-f10.948.h5
  54. 0 0
      BiddingKG/dl/role/log/ep026-loss0.195-val_loss0.170-f10.939.h5
  55. 0 0
      BiddingKG/dl/role/log/ep027-loss0.194-val_loss0.168-f10.940.h5
  56. 0 0
      BiddingKG/dl/role/log/ep028-loss0.192-val_loss0.168-f10.940.h5
  57. 0 0
      BiddingKG/dl/role/log/ep029-loss0.141-val_loss0.136-f10.949.h5
  58. 0 0
      BiddingKG/dl/role/log/ep029-loss0.189-val_loss0.168-f10.939.h5
  59. 0 0
      BiddingKG/dl/role/log/ep030-loss0.189-val_loss0.167-f10.940.h5
  60. 0 0
      BiddingKG/dl/role/log/ep030-loss0.504-val_loss0.477-f1_score0.838.h5
  61. 0 0
      BiddingKG/dl/role/log/ep031-loss0.187-val_loss0.166-f10.940.h5
  62. 0 0
      BiddingKG/dl/role/log/ep032-loss0.188-val_loss0.165-f10.940.h5
  63. 0 0
      BiddingKG/dl/role/log/ep033-loss0.184-val_loss0.165-f10.941.h5
  64. 0 0
      BiddingKG/dl/role/log/ep034-loss0.137-val_loss0.135-f10.950.h5
  65. 0 0
      BiddingKG/dl/role/log/ep034-loss0.184-val_loss0.165-f10.941.h5
  66. 0 0
      BiddingKG/dl/role/log/ep035-loss0.135-val_loss0.135-f10.950.h5
  67. 0 0
      BiddingKG/dl/role/log/ep035-loss0.183-val_loss0.164-f10.941.h5
  68. 0 0
      BiddingKG/dl/role/log/ep036-loss0.181-val_loss0.164-f10.941.h5
  69. 0 0
      BiddingKG/dl/role/log/ep037-loss0.182-val_loss0.163-f10.941.h5
  70. 0 0
      BiddingKG/dl/role/log/ep038-loss0.134-val_loss0.132-f10.952.h5
  71. 0 0
      BiddingKG/dl/role/log/ep038-loss0.179-val_loss0.162-f10.942.h5
  72. 0 0
      BiddingKG/dl/role/log/ep039-loss0.120-val_loss0.196-f1_score0.933.h5
  73. 0 0
      BiddingKG/dl/role/log/ep041-loss0.128-val_loss0.128-f10.953.h5
  74. 0 0
      BiddingKG/dl/role/log/ep041-loss0.179-val_loss0.162-f10.942.h5
  75. 0 0
      BiddingKG/dl/role/log/ep042-loss0.176-val_loss0.161-f10.942.h5
  76. 0 0
      BiddingKG/dl/role/log/ep043-loss0.158-val_loss0.207-f1_score0.925.h5
  77. 0 0
      BiddingKG/dl/role/log/ep044-loss0.175-val_loss0.161-f10.943.h5
  78. 0 0
      BiddingKG/dl/role/log/ep045-loss0.175-val_loss0.161-f10.943.h5
  79. 0 0
      BiddingKG/dl/role/log/ep046-loss0.175-val_loss0.160-f10.943.h5
  80. 0 0
      BiddingKG/dl/role/log/ep047-loss0.174-val_loss0.160-f10.942.h5
  81. 0 0
      BiddingKG/dl/role/log/ep048-loss0.122-val_loss0.125-f10.954.h5
  82. 0 0
      BiddingKG/dl/role/log/ep048-loss0.126-val_loss0.123-f10.955.h5
  83. 0 0
      BiddingKG/dl/role/log/ep048-loss0.171-val_loss0.160-f10.943.h5
  84. 0 0
      BiddingKG/dl/role/log/ep049-loss0.171-val_loss0.159-f10.943.h5
  85. 0 0
      BiddingKG/dl/role/log/ep051-loss0.170-val_loss0.158-f10.943.h5
  86. 0 0
      BiddingKG/dl/role/log/ep053-loss0.114-val_loss0.136-f10.952.h5
  87. 0 0
      BiddingKG/dl/role/log/ep054-loss0.168-val_loss0.158-f10.943.h5
  88. 0 0
      BiddingKG/dl/role/log/ep055-loss0.168-val_loss0.158-f10.944.h5
  89. 0 0
      BiddingKG/dl/role/log/ep056-loss0.104-val_loss0.174-f1_score0.943.h5
  90. 0 0
      BiddingKG/dl/role/log/ep057-loss0.165-val_loss0.157-f10.943.h5
  91. 0 0
      BiddingKG/dl/role/log/ep060-loss0.164-val_loss0.157-f10.944.h5
  92. 0 0
      BiddingKG/dl/role/log/ep061-loss0.165-val_loss0.157-f10.944.h5
  93. 0 0
      BiddingKG/dl/role/log/ep062-loss0.111-val_loss0.123-f10.955.h5
  94. 0 0
      BiddingKG/dl/role/log/ep062-loss0.163-val_loss0.157-f10.943.h5
  95. 0 0
      BiddingKG/dl/role/log/ep064-loss0.162-val_loss0.156-f10.943.h5
  96. 0 0
      BiddingKG/dl/role/log/ep064-loss0.585-val_loss0.634-f1_score0.927.h5
  97. 0 0
      BiddingKG/dl/role/log/ep065-loss0.162-val_loss0.156-f10.943.h5
  98. 0 0
      BiddingKG/dl/role/log/ep066-loss0.162-val_loss0.156-f10.944.h5
  99. 0 0
      BiddingKG/dl/role/log/ep067-loss0.160-val_loss0.156-f10.944.h5
  100. 0 0
      BiddingKG/dl/role/log/ep068-loss0.161-val_loss0.156-f10.944.h5

+ 1 - 1
BiddingKG/dl/common/Utils.py

@@ -438,7 +438,7 @@ def getUnifyMoney(money):
     result = Decimal(0)
     chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
     # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
-    chnFactorUnits = ["圆", "元","兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千']
+    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"]  # 20240611 修复大写提取错误 '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365')
     
     LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
     BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))

+ 15 - 3
BiddingKG/dl/entityLink/entityLink.py

@@ -160,11 +160,16 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
         bus_tenderee = []
         for _entity in list_entity:
             if _entity.entity_type in ["org","company"]:
+                ser = re.search('(?P<name>.{2,}(医院|大学|公司))(招[投议]?标|采购)(中心|办公室)$', _entity.entity_text) # 2024-06-07 规范单位名称,去除非必要字眼
+                if ser:
+                    _entity.entity_text = ser.group('name')
                 range_entity.append(_entity)
                 if _entity.entity_text in bus_dic:
                     have_bus = True
                 else:
                     have_bus, dic = get_business_data(_entity.entity_text)
+                    if re.search('^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
+                        have_bus = False
                     if have_bus:
                         lb, prob = get_role(dic)
                         bus_dic[_entity.entity_text] = (lb, prob)
@@ -175,6 +180,8 @@ def link_entitys(list_entitys,on_value=1):#on_value=0.81
                         bus_dic[_entity.entity_text] = (0, 0.5)
                 if have_bus: # 20231115 改为只判断是否有工商数据,没有就考虑替换
                     long_entity.append(_entity)
+                    if len(_entity.entity_text)< 6 and re.search('(大学|医院)', _entity.entity_text) == None:
+                        short_entity.append(_entity)
                     lb, prob = bus_dic[_entity.entity_text]
                     if lb in [0,1] and prob>0.9 and _entity.label in [0, 1] and _entity.values[_entity.label]<0.55: # 如果工商统计概率较高,文中概率较低,换为统计类别,主要为标题及发布人等招标、代理划分不明确情况
                         if _entity.label != lb:
@@ -514,9 +521,9 @@ def match_enterprise_max_first(sentence):
                         enter_tail = enter_name[-ENTERPRISE_TAIL_LEN:]
                         if re.search('[\u4e00-\u9fa5]', enter_tail) == None: # 20240111不包含中文后缀不要
                             continue
-                        elif enter_name in ['黄埔军校',  '五金建材', '铝合金门窗', '测试单位' ,'生产管理部']: # '国有资产管理处',
+                        elif enter_name in ['黄埔军校',  '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']: # '国有资产管理处',
                             continue
-                        elif re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$', enter_name):
+                        elif re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', enter_name):
                             continue
                         if len(enter_name)<4: # 20240521 短于4个字的不要
                             break
@@ -582,7 +589,12 @@ def calibrateEnterprise(list_articles,list_sentences,list_entitys):
                     if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                         find_flag = True
                         # 判断是否是多个公司
-                        if re.search('[分支](公司|中心|部|行)|大学附属\w{,6}医院', p_entity.entity_text):
+                        if re.search('[分支](公司|中心|监狱|部|行)|^\w{4,15}公司\w{2,3}公司$'
+                                     '|(大学|学院)\w{,2}附属\w{,6}医院$|(\w{2,5}办事处\w{2,6}$'
+                                     '|\w{2,4}[省市县]\w{2,14}村)(股份)?经济(合作|联合)社$|国家税务总局\w{2,10}税务局$',
+                                     p_entity.entity_text):
+                            continue
+                        if p_entity.entity_type == "location" and re.search('\d[楼室号]', p_entity.entity_text):  # 明确地址不进行替换避免 类似 434052508 西宁市城西区西关大街128号山东大厦15楼1152室 更新为 西宁市城西
                             continue
                         for _match_j in range(_match_index,len(list_match)):
                             if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:

+ 10 - 3
BiddingKG/dl/interface/Preprocessing.py

@@ -2624,6 +2624,8 @@ def special_treatment(sourceContent, web_source_no):
                 sourceContent = re.sub('排名:\d,', '候选', sourceContent)
         elif web_source_no=='DX000726-6':
             sourceContent = re.sub('卖方[::\s]+宝山钢铁股份有限公司', '招标单位:宝山钢铁股份有限公司', sourceContent)
+        elif web_source_no=='DX008791-1':
+            sourceContent = re.sub('收货单位:', '最终用户:', sourceContent)
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -2993,6 +2995,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         article_processed = re.sub('备选中标人', '第二候选人', article_processed)  # 341344142 # 2023/7/17 特殊表达修改
+        article_processed = re.sub('例:建设银行(甲方全称)', ' ', article_processed)  # 2024/06/12 特殊表达修改 修改 481513912 金采网 附件模板导致错误提取招标人
         if web_source_no.startswith('DX002756-'):
             article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         if web_source_no.startswith('DX006116-') and re.search('结果公告如下:.{5,50},单位名称:', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
@@ -3025,7 +3028,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed_list[1] = attachment_text
             article_processed = "##attachment##".join(article_processed_list)
         '''特别数据源对 预处理后文本 做特别修改'''
-        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8', '03795-1', '03795-2', 'DX000726-6']:
+        if web_source_no in ['03786-10', '00076-4', 'DX000105-2', '04080-3', '04080-4', '03761-3', '00695-7',"13740-2", '00811-8', '03795-1', '03795-2', 'DX000726-6','DX008791-1']:
             article_processed = special_treatment(article_processed, web_source_no)
 
         # 提取bidway
@@ -3450,7 +3453,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
     '''
 
     list_entitys = []
-    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗'] # 需要过滤掉的企业单位
+    not_extract_roles = ['黄埔军校', '国有资产管理处', '五金建材', '铝合金门窗', '华电XX发电有限公司', '华电XXX发电有限公司'] # 需要过滤掉的企业单位
     for list_sentence in list_sentences:
         sentences = []
         list_entitys_temp = []
@@ -3514,7 +3517,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     ner_entitys.append((b, e, 'company', entity))
 
             for it in re.finditer(
-                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆))[,。]',
+                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆)|海门\w{2,15}村)[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -3543,6 +3546,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_type = ner_entity[2]
                 entity_text = ner_entity[3]
 
+                if entity_type == 'location' and re.search('^\w{2,4}[市县]\w{3,15}(中心|监狱|殡仪馆)$', entity_text) and \
+                    re.search('\d[楼层号]', entity_text)==None: # 2024/06/07 修改错误地址实体为角色
+                    entity_type = 'org'
+
                 if entity_text.startswith('石山县'): # 2024/04/24 修复实体识别积石山县 识别少字问题
                     entity_text = '积' + entity_text
                     if 0<=begin_index_temp-1<len(sentence_text) and sentence_text[begin_index_temp-1] == '积':

+ 2 - 5
BiddingKG/dl/interface/extract.py

@@ -233,7 +233,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''规则调整角色概率'''
     start_time = time.time() #
-    predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys)
+    predictor.getPredictor("rolegrade").predict(list_sentences,list_entitys,original_docchannel)
     cost_time["rolegrade"] = round(time.time()-start_time,2)
 
     '''规则调整金额概率'''
@@ -342,9 +342,6 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''根据district提取结果修复实体'''
     repair_entity(prem,district,list_articles)
 
-    # '''限制行业最高金额'''
-    # getAttributes.limit_maximum_amount(prem, industry) # 20230703取消,改为整合所有要素后面纠正
-
     '''根据数据源最后召回招标人角色'''
     prem = predictor.getPredictor('websource_tenderee').get_websource_tenderee(web_source_no, web_source_name, prem)
 
@@ -368,7 +365,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-06-05'}
+    version_date = {'version_date': '2024-06-18'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 3 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -3715,6 +3715,7 @@ def limit_maximum_amount(dic, list_entity):
         '工程监理服务': 100000000,
         '工程造价服务': 100000000,
         '会计、审计及税务服务': 100000000,
+        '其他专业咨询与调查': 100000000
     }
     title = dic.get('doctitle_refine', '')
     name = dic.get('name', '')
@@ -3878,14 +3879,14 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                             e2 = behind_entity.wordOffset_end
                                             if _entity.sentence_index == behind_entity.sentence_index and behind_entity.entity_type in ['org', 'company'] \
                                                     and b2-e<10 and re.search('联合(体|投标人):|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[,;]成:|(成)$', s[b2-e:b2]) or \
-                                                re.search('(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]):
+                                                re.search('(联合(体|投标人))|(联合体(成员|单位)方?[12345一二三四五]?)|((联合体)?成员单位[12345一二三四五]?)|(特殊普通合伙|成员?)|^(成[),]$', s[e2:e2+10]) and behind_entity.label in [2, 5]:
                                                 join_l.append(behind_entity.entity_text)
                                                 b = b2
                                                 e = e2
                                             else:
                                                 break
                                         if len(join_l)>1:
-                                            d['win_tenderer_joint'] = ','.join(join_l)
+                                            d['win_tenderer_joint'] = ','.join(set(join_l))
 
 
 

+ 1 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -102,7 +102,7 @@ class Model_role_classify_word():
         text = re.sub('(采购|招标)人名称、地址和联系方式:|采购方,指', '采购人:', text) # 275065998  修复 224703143 采购的中标人;采购方,指 预测为中标
         if re.search('(最终)?排名:', text) and re.search('(最终)?排名:第?[123一二三]', text)==None:
             text = re.sub('(最终)?排名:', '    ', text)
-        # text = re.sub('(采购|招标|发布)机构', '发布人', text)
+        text = re.sub('交易单位', '发布单位', text)
         return text.replace('(', '(').replace(')', ')').replace('單', '单').replace('稱','承').replace('標', '标').replace('採購', '采购').replace('機構', '机构')
 
     def encode_word(self, sentence_text, begin_index, end_index, size=20, **kwargs):

+ 112 - 64
BiddingKG/dl/interface/predictor.py

@@ -857,6 +857,8 @@ class PREMPredict():
                 elif re.search('合同供方:?$|合同签约单位', front):
                     label = 0
                     values[label] = 0.5
+                elif re.search('现由$', front) and re.search('^作为\d个单位的牵头(单位|公司)?', behind): # 修复 469369884 站源批量预测错误 现由第七合同段保利长大工程有限公司作为6个单位的牵头单位,
+                    label = 5
             elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
@@ -1389,7 +1391,7 @@ class RoleRulePredictor():
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
                                      "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$|(采购商|招标人):(\w{2,10}-)?$)"
         self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
-        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
+        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束)|^([拟须需]|计划)(采购|招标|购置|购买)|^须购[买置]一批|作为(采购|招标)(人|单位)|^关于)"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>((代理|拍卖)(?:人|机构|公司|企业|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构|(采购|招标)代理)(名称|.{,4}名,?称|全称)?(是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
@@ -2231,7 +2233,7 @@ class RoleGrade():
         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
                              self.winTenderer_left_6, self.winTenderer_left_9,self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
-    def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
+    def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
         修改概率小于0.6的且在大数据代理集合里面的招标人为代理人
@@ -3154,10 +3156,16 @@ class ProductAttributesPredictor():
                             _budget = col1_l[i]
                             re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                             if re_price:
-                                _budget = re_price[0]
-                                if '万元' in col0_l[i] and '万' not in _budget:
-                                    _budget += '万元'
-                                budget = str(getUnifyMoney(_budget))
+                                # _budget = re_price[0]
+                                # if '万元' in col0_l[i] and '万' not in _budget:
+                                #     _budget += '万元'
+                                # budget = str(getUnifyMoney(_budget))
+                                _budget, _money_unit = money_process(_budget, col0_l[i])
+                                budget = str(_budget)
+                                if '.' in budget:
+                                    budget = budget.rstrip('0').rstrip('.')
+                                if float(budget)>= 500*100000000:
+                                    budget = ""
                         elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位', col0_l[i]):
                             header_list2.append(col0_l[i])
                             tenderee = re.sub("\s","",col1_l[i])
@@ -3180,7 +3188,7 @@ class ProductAttributesPredictor():
                         if order_begin_year>=2050 or order_end_year>=2050:
                             order_begin = order_end = ""
                     # print(product,demand,budget,order_begin)
-                    if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
+                    if product!= "" and demand != "" and budget!="" and order_begin != "":
                         link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date}
                         if link not in demand_link:
@@ -3696,13 +3704,19 @@ class ProductAttributesPredictor():
                                         _unitPrice = deal_list[id3]
                                         re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
                                         if re_price:
-                                            _unitPrice = re_price[0]
-                                            if '万元' in header_list[3] and '万' not in _unitPrice:
-                                                _unitPrice += '万元'
-                                            unitPrice = getUnifyMoney(_unitPrice)
-                                            if unitPrice>=10000*10000:
-                                                unitPrice = ""
-                                            unitPrice = str(unitPrice)
+                                            # _unitPrice = re_price[0]
+                                            # if '万元' in header_list[3] and '万' not in _unitPrice:
+                                            #     _unitPrice += '万元'
+                                            # unitPrice = getUnifyMoney(_unitPrice)
+                                            # if unitPrice>=10000*10000:
+                                            #     unitPrice = ""
+                                            # unitPrice = str(unitPrice)
+                                            _unitPrice, _money_unit = money_process(_unitPrice, header_list[3])
+                                            if _unitPrice >= 10000 * 10000:
+                                                _unitPrice = ""
+                                            unitPrice = str(_unitPrice)
+                                            if '.' in unitPrice:
+                                                unitPrice = unitPrice.rstrip('0').rstrip('.')
                                 if id4 != "":
                                     if re.search('\w', deal_list[id4]):
                                         brand = deal_list[id4]
@@ -3727,10 +3741,14 @@ class ProductAttributesPredictor():
                                         _budget = deal_list[id7]
                                         re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
                                         if re_price:
-                                            _budget = re_price[0]
-                                            if '万元' in header_list2[2] and '万' not in _budget:
-                                                _budget += '万元'
-                                            budget = str(getUnifyMoney(_budget))
+                                            # _budget = re_price[0]
+                                            # if '万元' in header_list2[2] and '万' not in _budget:
+                                            #     _budget += '万元'
+                                            # budget = str(getUnifyMoney(_budget))
+                                            _budget, _money_unit = money_process(_budget, header_list2[2])
+                                            budget = str(_budget)
+                                            if '.' in budget:
+                                                budget = budget.rstrip('0').rstrip('.')
                                             if float(budget)>= 100000*10000:
                                                 budget = ""
                                 if id8 != "":
@@ -3863,10 +3881,13 @@ class ProductAttributesPredictor():
 
 
     def add_product_attrs(self,channel_dic, product_attrs,  list_sentences,list_entitys,list_outlines,product_list,codeName,prem,text,page_time):
+        # print(1,product_attrs[1]['demand_info']['data'])
         if channel_dic['docchannel']['docchannel']=="采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
             product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
+        # print(2,product_attrs[1]['demand_info']['data'])
         if len(product_attrs[0]['product_attrs']['data']) == 0:
             product_attrs = self.predict_by_text(product_attrs,text,list_outlines,product_list,page_time)
+        # print(3,product_attrs[1]['demand_info']['data'])
         if len(product_attrs[1]['demand_info']['data'])>0:
             for d in product_attrs[1]['demand_info']['data']:
                 for product in set(prem[0]['product']):
@@ -3897,8 +3918,8 @@ class DocChannel():
       self.type_dic = {
           '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
-          '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
-          '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;'  # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
+          '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)|看样(时间|地[点址]|方式)|最小加价|加价幅度',
+          '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;|采购需求清单|最低价排序|竞争性采购方式|采购进行公开竞价|竞价模式[::\s]*一次报价|预算金额'  # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
       }
 
       self.title_type_dic = {
@@ -3921,12 +3942,12 @@ class DocChannel():
           '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
           '候选人公示neg': '中标候选人公示期|中标候选人公示前',
           '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
-          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)|项目已结束', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
-          '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(信息[,:]?)?(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
+          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)|项目已结束|中标公示 ', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
+          '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(信息[,:]?)?(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]|确定[\w()]{6,25}为中标人', # |唯一
           '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果',
       # |确定成交供应商[:,\s]
           '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
-          '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
+          '废标公告': '(终止|中止|废标|流标|流采|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
           '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置',
           '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形'
       }
@@ -6044,7 +6065,7 @@ class DistrictPredictor():
                 return ''
 
         def get_project_addr(text):
-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p1, text):
                 return re.search(p1, text).group('addr')
             else:
@@ -7125,17 +7146,20 @@ class WebsourceTenderee():
 
 class ApprovalPredictor():
     def __init__(self):
+        '''
+        项目(法人)单位
+        '''
         self.other_part = {
-            "project_name": "(项目|工程|采购|招标)名称:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?",
-            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标)编[号码]):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?",
-            "doc_num": "((审[批查]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案)文号|综合受理号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)[,。]?(\w{2,10}:|$)?",
-            "pro_type": "(申报类型|项目所属行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?",
-            "year_limit": "((建设|工程|服务)年限):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?",
-            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|建设规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
-            "approval_items": "((审[批查]|批[复准])事项|事项名称):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
-            "properties": "((建设|工程)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
-            "approval_result": "((审[批查]|批[复准])(结果|决定)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?",
-            "phone": "联系电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|"
+            "project_name": "(项目|工程|采购|招标|计划)名称?:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
+            "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案)(编[号码]|号)):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
+            "doc_num": "((审[批查]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认)[编]?号|综合受理号|文书号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
+            "pro_type": "(申[请](类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业|项目类型|立项类型):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
+            "year_limit": "((建设|工程|服务|项目)(年限|期限|时长)):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?", # 建设年限
+            "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|工程|项目)规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设规模
+            "approval_items": "((审[批查]|批[复准]申请)(事项|内容)|事项名称|事项审批):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批事项
+            "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设性质
+            "approval_result": "((审[批查]|批[复准])(结果|决定|结论|状态|回复)|(办理|,)(状态|意见|结果)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批结果
+            "phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # 联系电话
                      '\+86.?1[3-9]\d{9}|'
                      '0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
                      '0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?(?=1[3-9]\d{9})|'
@@ -7148,26 +7172,26 @@ class ApprovalPredictor():
         }
 
         self.role_type = {
-            "declare_company": "申报(部门|机关|单位|企业|公司)",  # 申报单位
-            "construct_company": "(业主|建设|用地))?(部门|机关|单位|企业|公司)|主送机关|法人单位",  # 建设单位
-            "approver": "(审批|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司)",  # 审批部门
-            "evaluation_agency": "环境影响评价机构|环评机构|评价机构|环评单位"  # 环评机构
+            "declare_company": "([请]|填报|呈报)(部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
+            "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方)|主送机关|法人单位|甲方",  # 建设单位
+            "approver": "(审[查核]|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司|机构)",  # 审批部门
+            "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)"  # 环评机构
         }
         self.person_type = {
-            "legal_person": "项目法人|法定代表人"  # 项目法人
+            "legal_person": "项目法人|法定代表人|企业法人"  # 项目法人
         }
         self.date_type = {
-            "time_declare": "申报时间",
-            "time_commencement": "开工时间",
-            "time_completion": "竣工时间"
+            "time_declare": "([请]|填报|呈报)(时间|日期)", # 申报时间
+            "time_commencement": "(开工|动工|施工开始)(时间|日期)", # 开工时间
+            "time_completion": "(竣工|完工|验收|(项目|建设|工程)(完成|结束))(备案)?(时间|日期)" # 竣工时间
         }
 
         self.addr_type = {
-            "project_addr": "(建设|工程|项目)(地址|地点|位置)"
+            "project_addr": "(建设|工程|项目|施工)(地址|地点|位置|所在地)|[宗土]地坐落|用地位置" # 建设地址
         }
 
         self.money_type = {
-            "total_tendereeMoney": "项目金额|项目投资|总投资|投资总额|总预算|总概算|投资规模|批复概算|投资额",
+            "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额", # 总投资
         }
 
     def predict(self, list_sentences, list_entitys, span=12):
@@ -7187,67 +7211,91 @@ class ApprovalPredictor():
                              self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
             multi_project['moneysource'] = ''
             text = sentences[i]
-            for k, v in self.other_part.items():
-                for iter in re.finditer(v, text):
-                    rs_dic[k] = iter.group('main')
-                    multi_project[k] = iter.group('main')
-                    found_key = 1
-                    break
             for entity in entities[i]:
                 b, e = entity.wordOffset_begin, entity.wordOffset_end
                 if entity.entity_type in ['org', 'company']:
                     for k, v in self.role_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            rs_dic[k] = entity.entity_text
+                            if rs_dic[k] == '':
+                                rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
                 elif entity.entity_type in ['person']:
                     for k, v in self.person_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            rs_dic[k] = entity.entity_text
+                            if rs_dic[k] == '':
+                                rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
                             break
                 elif entity.entity_type in ['time']:
                     for k, v in self.date_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            rs_dic[k] = entity.entity_text
+                            if rs_dic[k] == '':
+                                rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
                 elif entity.entity_type in ['location']:
                     for k, v in self.addr_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            rs_dic[k] = entity.entity_text
+                            if rs_dic[k] == '':
+                                rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
                 elif entity.entity_type in ['money']:
                     for k, v in self.money_type.items():
                         if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
-                            rs_dic[k] = entity.entity_text
+                            if rs_dic[k] == '':
+                                rs_dic[k] = entity.entity_text
                             multi_project[k] = entity.entity_text
                             found_key = 1
                 elif entity.entity_type in ['moneysource']:
                     rs_dic['moneysource'] = turnMoneySource(entity.entity_text)
                     multi_project['moneysource'] = turnMoneySource(entity.entity_text)
+                elif entity.entity_type in ['code']:
+                    k = 'project_code'
+                    v = self.other_part[k].split(':', maxsplit=1)[0]
+                    if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
+                        if rs_dic[k] == '':
+                            rs_dic[k] = entity.entity_text
+                        multi_project[k] = entity.entity_text
+                        found_key = 1
+                elif entity.entity_type in ['name']:
+                    k = 'project_name'
+                    v = self.other_part[k].split(':', maxsplit=1)[0]
+                    if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
+                        if rs_dic[k] == '':
+                            rs_dic[k] = entity.entity_text
+                        multi_project[k] = entity.entity_text
+                        found_key = 1
+            for k, v in self.other_part.items():
+                for iter in re.finditer(v, text):
+                    if rs_dic[k] == '':
+                        rs_dic[k] = iter.group('main')
+                    multi_project[k] = iter.group('main')
+                    found_key = 1
+                    break
             if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
                 code_name_set.add(multi_project['project_code']+multi_project['project_name'])
                 district = getPredictor('district').get_area(
                     multi_project['project_name'] + multi_project['project_addr'], '')
-                multi_project['area'] = district['district']['area']
-                multi_project['province'] = district['district']['province']
-                multi_project['city'] = district['district']['city']
-                multi_project['district'] = district['district']['district']
+                if district['district']['province'] != '全国':
+                    multi_project['area'] = district['district']['area']
+                    multi_project['province'] = district['district']['province']
+                    multi_project['city'] = district['district']['city']
+                    multi_project['district'] = district['district']['district']
                 multi_project = {k:v for k,v in multi_project.items() if v != ''}
                 rs_l.append(multi_project)
-        if len(rs_l)>1:
+        if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())&set(rs_l[1].keys())!=set():
             return rs_l
         elif found_key == 1:
             district = getPredictor('district').get_area(
                 rs_dic['construct_company'] + rs_dic['project_name'] + rs_dic['project_addr'], '')
-            rs_dic['area'] = district['district']['area']
-            rs_dic['province'] = district['district']['province']
-            rs_dic['city'] = district['district']['city']
-            rs_dic['district'] = district['district']['district']
+            if district['district']['province'] != '全国':
+                rs_dic['area'] = district['district']['area']
+                rs_dic['province'] = district['district']['province']
+                rs_dic['city'] = district['district']['city']
+                rs_dic['district'] = district['district']['district']
             rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
             return [rs_dic]
         return []

+ 0 - 0
BiddingKG/dl/role/log/ep001-loss0.961-val_loss0.612-f10.713.h5


+ 0 - 0
BiddingKG/dl/role/log/ep002-loss0.603-val_loss0.459-f10.796.h5


+ 0 - 0
BiddingKG/dl/role/log/ep003-loss0.297-val_loss0.243-f10.918.h5


+ 0 - 0
BiddingKG/dl/role/log/ep003-loss0.494-val_loss0.415-f10.812.h5


+ 0 - 0
BiddingKG/dl/role/log/ep004-loss0.280-val_loss0.231-f10.920.h5


+ 0 - 0
BiddingKG/dl/role/log/ep004-loss0.448-val_loss0.386-f10.827.h5


+ 0 - 0
BiddingKG/dl/role/log/ep005-loss0.268-val_loss0.223-f10.921.h5


+ 0 - 0
BiddingKG/dl/role/log/ep005-loss0.410-val_loss0.337-f10.864.h5


+ 0 - 0
BiddingKG/dl/role/log/ep006-loss0.260-val_loss0.215-f10.923.h5


+ 0 - 0
BiddingKG/dl/role/log/ep006-loss0.363-val_loss0.303-f10.886.h5


+ 0 - 0
BiddingKG/dl/role/log/ep007-loss0.252-val_loss0.209-f10.924.h5


+ 0 - 0
BiddingKG/dl/role/log/ep007-loss0.307-val_loss0.225-f10.924.h5


+ 0 - 0
BiddingKG/dl/role/log/ep008-loss0.101-val_loss0.175-f1_score0.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep008-loss0.118-val_loss0.171-f1_score0.937.h5


+ 0 - 0
BiddingKG/dl/role/log/ep008-loss0.243-val_loss0.204-f10.925.h5


+ 0 - 0
BiddingKG/dl/role/log/ep008-loss0.263-val_loss0.205-f10.929.h5


+ 0 - 0
BiddingKG/dl/role/log/ep009-loss0.110-val_loss0.172-f1_score0.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep009-loss0.240-val_loss0.199-f10.926.h5


+ 0 - 0
BiddingKG/dl/role/log/ep009-loss0.243-val_loss0.199-f10.930.h5


+ 0 - 0
BiddingKG/dl/role/log/ep010-loss0.234-val_loss0.196-f10.926.h5


+ 0 - 0
BiddingKG/dl/role/log/ep011-loss0.217-val_loss0.184-f10.938.h5


+ 0 - 0
BiddingKG/dl/role/log/ep011-loss0.231-val_loss0.193-f10.927.h5


+ 0 - 0
BiddingKG/dl/role/log/ep012-loss0.206-val_loss0.174-f10.939.h5


+ 0 - 0
BiddingKG/dl/role/log/ep012-loss0.227-val_loss0.190-f10.928.h5


+ 0 - 0
BiddingKG/dl/role/log/ep013-loss0.224-val_loss0.188-f10.929.h5


+ 0 - 0
BiddingKG/dl/role/log/ep014-loss0.189-val_loss0.164-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep014-loss0.220-val_loss0.185-f10.930.h5


+ 0 - 0
BiddingKG/dl/role/log/ep015-loss0.180-val_loss0.161-f10.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep015-loss0.216-val_loss0.184-f10.931.h5


+ 0 - 0
BiddingKG/dl/role/log/ep016-loss0.215-val_loss0.181-f10.931.h5


+ 0 - 0
BiddingKG/dl/role/log/ep017-loss0.175-val_loss0.157-f10.945.h5


+ 0 - 0
BiddingKG/dl/role/log/ep017-loss0.210-val_loss0.180-f10.932.h5


+ 0 - 0
BiddingKG/dl/role/log/ep018-loss0.172-val_loss0.150-f10.945.h5


+ 0 - 0
BiddingKG/dl/role/log/ep018-loss0.209-val_loss0.179-f10.932.h5


+ 0 - 0
BiddingKG/dl/role/log/ep019-loss0.208-val_loss0.177-f10.933.h5


+ 0 - 0
BiddingKG/dl/role/log/ep020-loss0.206-val_loss0.176-f10.934.h5


+ 0 - 0
BiddingKG/dl/role/log/ep021-loss0.159-val_loss0.148-f10.946.h5


+ 0 - 0
BiddingKG/dl/role/log/ep021-loss0.204-val_loss0.175-f10.936.h5


+ 0 - 0
BiddingKG/dl/role/log/ep022-loss0.200-val_loss0.173-f10.937.h5


+ 0 - 0
BiddingKG/dl/role/log/ep022-loss0.579-val_loss0.548-f1_score0.805.h5


+ 0 - 0
BiddingKG/dl/role/log/ep023-loss0.199-val_loss0.173-f10.937.h5


+ 0 - 0
BiddingKG/dl/role/log/ep024-loss0.156-val_loss0.146-f10.947.h5


+ 0 - 0
BiddingKG/dl/role/log/ep024-loss0.198-val_loss0.171-f10.938.h5


+ 0 - 0
BiddingKG/dl/role/log/ep025-loss0.149-val_loss0.146-f10.948.h5


+ 0 - 0
BiddingKG/dl/role/log/ep025-loss0.197-val_loss0.170-f10.938.h5


+ 0 - 0
BiddingKG/dl/role/log/ep026-loss0.149-val_loss0.143-f10.948.h5


+ 0 - 0
BiddingKG/dl/role/log/ep026-loss0.195-val_loss0.170-f10.939.h5


+ 0 - 0
BiddingKG/dl/role/log/ep027-loss0.194-val_loss0.168-f10.940.h5


+ 0 - 0
BiddingKG/dl/role/log/ep028-loss0.192-val_loss0.168-f10.940.h5


+ 0 - 0
BiddingKG/dl/role/log/ep029-loss0.141-val_loss0.136-f10.949.h5


+ 0 - 0
BiddingKG/dl/role/log/ep029-loss0.189-val_loss0.168-f10.939.h5


+ 0 - 0
BiddingKG/dl/role/log/ep030-loss0.189-val_loss0.167-f10.940.h5


+ 0 - 0
BiddingKG/dl/role/log/ep030-loss0.504-val_loss0.477-f1_score0.838.h5


+ 0 - 0
BiddingKG/dl/role/log/ep031-loss0.187-val_loss0.166-f10.940.h5


+ 0 - 0
BiddingKG/dl/role/log/ep032-loss0.188-val_loss0.165-f10.940.h5


+ 0 - 0
BiddingKG/dl/role/log/ep033-loss0.184-val_loss0.165-f10.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep034-loss0.137-val_loss0.135-f10.950.h5


+ 0 - 0
BiddingKG/dl/role/log/ep034-loss0.184-val_loss0.165-f10.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep035-loss0.135-val_loss0.135-f10.950.h5


+ 0 - 0
BiddingKG/dl/role/log/ep035-loss0.183-val_loss0.164-f10.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep036-loss0.181-val_loss0.164-f10.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep037-loss0.182-val_loss0.163-f10.941.h5


+ 0 - 0
BiddingKG/dl/role/log/ep038-loss0.134-val_loss0.132-f10.952.h5


+ 0 - 0
BiddingKG/dl/role/log/ep038-loss0.179-val_loss0.162-f10.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep039-loss0.120-val_loss0.196-f1_score0.933.h5


+ 0 - 0
BiddingKG/dl/role/log/ep041-loss0.128-val_loss0.128-f10.953.h5


+ 0 - 0
BiddingKG/dl/role/log/ep041-loss0.179-val_loss0.162-f10.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep042-loss0.176-val_loss0.161-f10.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep043-loss0.158-val_loss0.207-f1_score0.925.h5


+ 0 - 0
BiddingKG/dl/role/log/ep044-loss0.175-val_loss0.161-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep045-loss0.175-val_loss0.161-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep046-loss0.175-val_loss0.160-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep047-loss0.174-val_loss0.160-f10.942.h5


+ 0 - 0
BiddingKG/dl/role/log/ep048-loss0.122-val_loss0.125-f10.954.h5


+ 0 - 0
BiddingKG/dl/role/log/ep048-loss0.126-val_loss0.123-f10.955.h5


+ 0 - 0
BiddingKG/dl/role/log/ep048-loss0.171-val_loss0.160-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep049-loss0.171-val_loss0.159-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep051-loss0.170-val_loss0.158-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep053-loss0.114-val_loss0.136-f10.952.h5


+ 0 - 0
BiddingKG/dl/role/log/ep054-loss0.168-val_loss0.158-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep055-loss0.168-val_loss0.158-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep056-loss0.104-val_loss0.174-f1_score0.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep057-loss0.165-val_loss0.157-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep060-loss0.164-val_loss0.157-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep061-loss0.165-val_loss0.157-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep062-loss0.111-val_loss0.123-f10.955.h5


+ 0 - 0
BiddingKG/dl/role/log/ep062-loss0.163-val_loss0.157-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep064-loss0.162-val_loss0.156-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep064-loss0.585-val_loss0.634-f1_score0.927.h5


+ 0 - 0
BiddingKG/dl/role/log/ep065-loss0.162-val_loss0.156-f10.943.h5


+ 0 - 0
BiddingKG/dl/role/log/ep066-loss0.162-val_loss0.156-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep067-loss0.160-val_loss0.156-f10.944.h5


+ 0 - 0
BiddingKG/dl/role/log/ep068-loss0.161-val_loss0.156-f10.944.h5


Some files were not shown because too many files changed in this diff