Browse source

Optimize review experts, service period, package numbers and products

lsm 2 years ago
parent
commit
dab45a5d74

+ 16 - 6
BiddingKG/dl/common/Utils.py

@@ -797,23 +797,28 @@ def uniform_package_name(package_name):
    '''
    package_name_raw = package_name
    package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
+    package_name = package_name.replace('标段(包)', '标段').replace('№', '')
+    package_name = re.sub('\[|【', '', package_name)
    kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
    name = ""
    if kw:
        name += kw.group(0)
    if re.search('^[a-zA-Z0-9-]{5,}$', package_name):   # 五个字符以上编号
        _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
+        # print('规范化包号1', _digit)
        name += _digit
    elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # 处理类似 A包2标段
        ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
+        # print('规范化包号2', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
        _digit = ser.groupdict().get('num')
        _digit = uniform_num(_digit)
        name += _char.upper() + _digit
-    elif re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name): # 处理类似 A包2标段
-        ser = re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name)
+    elif re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # 处理类似 A包2标段
+        ser = re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name)
+        # print('规范化包号3', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
@@ -822,8 +827,9 @@ def uniform_package_name(package_name):
        if _char:
            name += _char.upper()
        name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # 数字的统一的阿拉伯数字
-        ser = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+    elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # 数字的统一的阿拉伯数字
+        ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+        # print('规范化包号4', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
@@ -832,18 +838,22 @@ def uniform_package_name(package_name):
        if _char:
            name += _char.upper()
        name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name):  # 数字的统一的阿拉伯数字
-        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name).group('eng').upper()
+    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name):  # 数字的统一的阿拉伯数字
+        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name).group('eng').upper()
+        # print('规范化包号5', _digit)
        name += _digit
    elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name):  # 数字的统一的阿拉伯数字
        _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
+        # print('规范化包号6', _digit)
        name += _digit
    elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name):  # 数字的统一的阿拉伯数字
        _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
+        # print('规范化包号7', _digit)
        _digit = uniform_num(_digit)
        name += _digit
    elif re.search('^[a-zA-Z0-9-]+$', package_name):
        _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
+        # print('规范化包号8', _char)
        name += _char.upper()
    if name == "":
        return package_name_raw

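For reference, a minimal standalone sketch of the normalization idea above — not the repo function itself — showing how the eng/num named groups combine and how Chinese and Roman numerals map to Arabic digits. uniform_num and normalize_package_label here are illustrative stand-ins, assuming the repo's uniform_num behaves like the table below.

import re

# Hypothetical stand-in for the repo's uniform_num(): map Chinese and Roman
# numerals to Arabic digits, pass Arabic input through unchanged.
CN = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5',
      '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
ROMAN = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}

def uniform_num(num):
    return CN.get(num) or ROMAN.get(num) or num

def normalize_package_label(package_name):
    # Same shape as the widened branch above: optional alphanumeric prefix
    # (eng), a numeral (num), then a lot/package suffix such as 标段 or 合同包.
    pat = re.compile('第?(?P<eng>[0-9a-zA-Z-]{1,4})?'
                     '(?P<num>[0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})'
                     '(标[段号的包项]?|合同[包段]|[分子]?[包标])')
    ser = pat.search(package_name)
    if not ser:
        return package_name
    eng = (ser.group('eng') or '').upper()
    return eng + uniform_num(ser.group('num'))

print(normalize_package_label('第三标段'))  # -> 3
print(normalize_package_label('A1合同包'))  # -> A1
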
+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -257,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-03-10'}
+    version_date = {'version_date': '2023-04-07'}
    data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
    data_res["doctitle_refine"] = doctitle_refine
    data_res["nlp_enterprise"] = nlp_enterprise

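The version_date bump above ends up in data_res, which predict() assembles by splatting each extractor's dict into one result. A minimal sketch of that merge with hypothetical payloads; later keyword arguments win on duplicate keys.

# Sketch only: the payloads below are placeholders, not the real structures.
codeName = {'code': ['XY-2023-001'], 'name': '某某采购项目'}
prem = {'prem': {'Project': {'roleList': []}}}
version_date = {'version_date': '2023-04-07'}

data_res = dict(codeName, **prem, **version_date)
print(data_res['version_date'])  # 2023-04-07
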
+ 24 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -807,8 +807,18 @@ def getPackagesFromArticle(list_sentence, list_entity):
    PackageSet = set()
    dict_packageCode = dict()

-    package_number_pattern =  re.compile(
-    '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标段?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
+    # package_number_pattern =  re.compile(
+    # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
+
+    package_number_pattern = re.compile(
+        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|合同[包段]))\
+|(([,;。、:(]|第)?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*))\
+|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]+)\
+|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)\
+|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)')
+
    other_package_pattern = re.compile(
        '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  # # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
    win_tenderer_pattern = re.compile('(中标候?选?人|供应商)(名称)?[::](.{2,25})[,。]')  # 2020/11/23 大网站规则 调整
@@ -873,6 +883,15 @@ def getPackagesFromArticle(list_sentence, list_entity):
            PackageList_item = []
            PackageList_item_scope = []
            content = list_sentence[i].sentence_text
+
+            content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
+            # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
+            content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
+
+            for it in re.finditer('CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
+|标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
+|\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+', content):
+                content = content.replace(it.group(0), ' ' * len(it.group(0)))
            tokens = list_sentence[i].tokens
            _names = []
            for iter in re.finditer(package_number_pattern, content):
@@ -891,6 +910,9 @@ def getPackagesFromArticle(list_sentence, list_entity):
                elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]):  # 不得参加同一标段
                    # print('过滤掉错误包:', iter.group())
                    continue
+                elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
+                    # print('过滤掉错误包号5:', iter.group(0))
+                    continue
                temp_package_number = uniform_package_name(iter.group(0))
                True_package.add(temp_package_number)
                PackageList_item.append({"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index,

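The pre-pass added above blanks out known false positives (质量三包, file suffixes, amounts, dates) with runs of spaces of the same length, so the sentence keeps its length and every later package_number_pattern match still carries offsets that are valid in the original text. A minimal sketch of the idea:

import re

sentence = '质量三包要求详见附件,第2标段:设备采购。'

# Mask a false positive with same-length whitespace instead of deleting it,
# so downstream match spans still index into the original sentence.
masked = sentence
for it in re.finditer('(质量|责任)三包', masked):
    masked = masked.replace(it.group(0), ' ' * len(it.group(0)))

assert len(masked) == len(sentence)
m = re.search('第?[0-9一二三四五六七八九十]{1,4}标段?', masked)
print(m.group(0), m.span())  # 第2标段 — the span is valid in `sentence` too
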
+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -3641,7 +3641,7 @@ class DocChannel():
          if '采购意向' in life_kw_title or '采购意向' in life_list:
              return '采购意向', msc
          elif '招标预告' in life_kw_title or '招标预告' in life_list:
-              if '中标信息' in life_kw_title:
+              if '中标信息' in life_kw_title or '中标信息' in life_list:
                  return '中标信息', msc
              elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
                  return '', msc

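The one-line fix above lets a 中标信息 signal coming from the body keyword list (life_list), not only from the title, override a 招标预告 classification. A condensed sketch of that precedence with hypothetical inputs (not the full DocChannel logic, and without the msc score):

def resolve_channel(life_kw_title, life_list):
    # 采购意向 first, then 招标预告; a 中标信息 keyword from either the
    # title or the body now wins over the 预告 label.
    if '采购意向' in life_kw_title or '采购意向' in life_list:
        return '采购意向'
    if '招标预告' in life_kw_title or '招标预告' in life_list:
        if '中标信息' in life_kw_title or '中标信息' in life_list:
            return '中标信息'
        return '招标预告'
    return ''

print(resolve_channel(['招标预告'], ['中标信息']))  # 中标信息
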
BIN
BiddingKG/dl/interface/product_savedmodel/productAndfailreason.pb


+ 294 - 51
BiddingKG/dl/product/data_tfrecord.py

@@ -10,6 +10,21 @@ import os
 import re
 import collections
 from BiddingKG.dl.product.data_util import word2id, max_id
+import psycopg2
+import json
+import pickle
+
+conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='bid_validate')
+cursor = conn.cursor()
+def get_title(docid):
+    sql = "select doctitle from qiao_ke_bao_raw where docid='{0}'".format(docid)
+    cursor.execute(sql)
+    for row in cursor.fetchall():
+        return row[0]
+    return ''
+
+product_notin = []
+
 max_len = 500

 def create_int_feature(values):
@@ -61,8 +76,11 @@ def fix_label_ner_句号分开(sentence, product_list, reasons_list):


 def create_instances_from_document_句号分开(docid, document_text, product_list, reasons_list):
-    for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
-        document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    # for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
+    #     document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    for it in re.finditer('[^\w\d。][一二三四五六七八九十]{1,3}、', document_text):
+        t = it.group(0)
+        document_text = document_text.replace(t, '。' + t[1:])

     if docid in ['docid']:
         pass
@@ -137,6 +155,10 @@ def fix_label_ner(sentence, product_list, reasons_list):
     tag_list = ['S'] * len(sentence)
     word_list = list(sentence)
     for product in product_list:
+        if len(re.sub('[^\w]', '', product))<1:
+            print('错误产品: ', product)
+            continue
+
         b = sentence.find(product)
         while b != -1:
             e = b + len(product)
@@ -158,10 +180,97 @@ def fix_label_ner(sentence, product_list, reasons_list):
             b = sentence.find(reason, e)
     return tag_list, word_list

+def fix_label_ner_remove_punctuation(sentence, product_list, reasons_list):
+    tag_list = ['S'] * len(sentence)
+    word_list = list(sentence)
+    if len(product_list)>0:
+        for it in re.finditer('|'.join(product_list), sentence):
+            b, e = it.span()
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-pro'
+                tag_list[e - 1] = 'E-pro'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-pro'
+
+    for reason in reasons_list:
+        b = sentence.find(reason)
+        while b != -1:
+            e = b + len(reason)
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-rea'
+                tag_list[e - 1] = 'E-rea'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-rea'
+            b = sentence.find(reason, e)
+    return tag_list, word_list
+
+def create_instances_from_document_remove_punctuation(docid, document_text, product_list, reasons_list):
+    product_list = set([re.sub('[^\w]', '', it) for it in product_list if len(re.sub('[^\w]', '', it))>1])  # 产品字段去掉符号
+    reasons_list = set([re.sub('[^\w]', '', it) for it in reasons_list if len(re.sub('[^\w]', '', it))>1])
+    document_text = re.sub('[^\w]', '', document_text)
+
+    product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
+    reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
+    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?(原因|理由)', document_text)
+    if reasons_list == [] and kw_re:
+        document_text = re.sub('(流标|废标|终止|中止|失败|异常)的?(原因|理由).{, 30}', '', document_text)
+
+    pos = []
+    neg = []
+    if len(document_text)<= max_len:
+        document_text = document_text[:max_len]
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if len(reasons_list)>0 and 'B-rea' not in tag_list:
+            print("少于%d字的文章废标原因标注未找到:%s"%(max_len, docid))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'E-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    elif len(reasons_list)>0:
+        b = document_text.find(reasons_list[0])
+        if b != -1:
+            document_text = document_text[max(0, b-8):][:max_len]
+        else:
+            document_text = document_text[:max_len]
+            print("多于%d字的文章废标原因标注未找到:%s," % (max_len, docid))
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if 'E-rea' not in tag_list:
+            print("文章废标原因标注未找到:%s, 开始位置:%d"%(docid, b))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'B-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    else:
+        epoch = len(document_text)//max_len
+        if len(document_text)%max_len > 50:
+            epoch += 1
+        for i in range(epoch):
+            sentence = document_text[i*max_len: (i+1)*max_len]
+            if len(sentence)<5:
+                # print("句子长度小于5")
+                # print(sentence)
+                continue
+            sentence = sentence[:max_len]
+            tag_list, word_list = fix_label_ner_remove_punctuation(sentence, product_list, reasons_list)
+            instance = TrainingInstance(word_list, tag_list)
+            if 'B-pro' in tag_list or 'B-rea' in tag_list:
+                pos.append(instance)
+            else:
+                neg.append(instance)
+    random.shuffle(neg)
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
+    instances = pos+neg
+    random.shuffle(instances)
+    return instances
+
 def create_instances_from_document(docid, document_text, product_list, reasons_list):
     product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
     reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
     kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
+
     if reasons_list == [] and kw_re:
         kw = kw_re.group(0)
         idx = document_text.find(kw)
@@ -196,10 +305,13 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
         else:
             neg.append(instance)
     else:
-        for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
-            document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        # for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
+        #     document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        for it in re.finditer('[^\w\d][一二三四五六七八九十]{1,3}、', document_text):
+            t = it.group(0)
+            document_text = document_text.replace(t, '。' + t[1:])
         for sentence in document_text.split('。'):
-            if len(sentence)<2:
+            if len(sentence)<5:
                 # print("句子长度小于5")
                 # print(sentence)
                 continue
@@ -249,7 +361,8 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
                 else:
                     neg.append(instance)
     random.shuffle(neg)
-    neg = neg[:min(5, 10*len(pos))]
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
     instances = pos+neg
     random.shuffle(instances)
     return instances
@@ -259,37 +372,92 @@ def create_training_instances(df):
     # df = pd.read_excel(xlsx)
     df.fillna('', inplace=True)
     for i in df.index:
-        try:
-            docid = df.loc[i, 'docid']
-            document_text = df.loc[i, 'text']
-            product_list = json.loads(df.loc[i, 'lbset'])
-            reasons_list = json.loads(df.loc[i, 'reasons_list'])
-            # if reasons_list == []:
-            #     continue
-            instances.extend(
-                create_instances_from_document(
-                    docid, document_text, product_list, reasons_list
-                ))
-        except Exception as e:
-            print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
+        if i % 5000==0:
+            print('create_instance', i)
+        # try:
+        docid = df.loc[i, 'docid']
+        document_text = df.loc[i, 'text']
+        product_list = json.loads(df.loc[i, 'lbset'])
+        reasons_list = json.loads(df.loc[i, 'reasons_list'])
+
+        notin_num = 0
+        for i in range(len(product_list)):  # 如果在公告找不到产品,尝试在中间加标点符号,
+            p = product_list[i]
+            if re.search('[^\w]', p) == None and re.search(p, document_text) == None:
+                ser = re.search('[^\w]{,2}'.join(p), document_text)
+                if ser:
+                    product_list[i] = ser.group(0)
+                elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                    product_list[i] = p.replace('项目', '采购项目')
+                elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                    product_list[i] = p.replace('项目', '')
+                elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                    product_list[i] = p.lower()
+                elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                    product_list[i] = p.upper()
+                    document_text = document_text.upper()
+                else:
+                    title = get_title(docid)
+                    if title not in document_text:
+                        document_text = title + "。" + document_text
+                        ser = re.search('[^\w]{,2}'.join(p), document_text)
+                        if ser:
+                            product_list[i] = ser.group(0)
+                        elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                            product_list[i] = p.replace('项目', '采购项目')
+                        elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                            product_list[i] = p.replace('项目', '')
+                        elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                            product_list[i] = p.lower()
+                        elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                            product_list[i] = p.upper()
+                            document_text = document_text.upper()
+                        else:
+                            # print('docid:%s,not in text product: %s' % (docid, p))
+                            notin_num += 1
+                            if re.search('业绩', document_text) == None:
+                                product_notin.append((docid, p))
+                    else:
+                        # print('docid:%s,not in text product: %s'%(docid, p))
+                        notin_num +=1
+                        if re.search('业绩', document_text) == None:
+                            product_notin.append((docid, p))
+        if notin_num > len(product_list)/2:
+            print('找到的产品少于一半,过滤掉', docid, product_list)
+            continue
+
+        # if reasons_list == []:
+        #     continue
+        instances.extend(
+            create_instances_from_document(
+                docid, document_text, product_list, reasons_list
+            ))
+        # instances.extend(
+        #     create_instances_from_document_remove_punctuation(
+        #         docid, document_text, product_list, reasons_list
+        #     ))
+        # except Exception as e:
+        #     print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
     return instances

-def write_instance_to_example_files(instances, word2index, tag2index, output_dir):
+def write_instance_to_example_files(instances, word2index, tag2index, output_dir, tfrecode_name):
     # writers = []
     # instances = sorted(instances, key=lambda x: len(x.word_list))
     i = 0
     # for max_len in [200, 500, 1000]:
-    writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    # writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    writer = tf.python_io.TFRecordWriter(output_dir + '/%s'%tfrecode_name)
     # print('排序前:', [len(x.word_list) for x in instances[:5]])
     # instances.sort(key=lambda x:len(x.word_list), reverse=True)
     # print('排序后:', [len(x.word_list) for x in instances[:5]])
     while i < len(instances):
+        if i % 5000 == 0:
+            print('开始写入', i)
         instance = instances[i]
         if len(instance.word_list)>max_len:
             writer.close()
             break
         i += 1
-        # word_ids = [word2index.get(word, max_id) for word in instance.word_list]
         word_ids = [word2index.get(word, word2index.get('<unk>')) for word in instance.word_list]
         tag_ids = [tag2index.get(tag, 0) for tag in instance.tag_list]
         while len(word_ids)<max_len:
@@ -303,42 +471,117 @@ def write_instance_to_example_files(instances, word2index, tag2index, output_dir
         writer.write(tf_example.SerializeToString())
     writer.close()

+def 去除标注不在公告里面的公告(df):
+    df['notin'] = df.apply(
+        lambda x: json.dumps([it for it in json.loads(x['lbset']) if re.sub('[^\w]', '', it) not in re.sub('[^\w]', '', x['text'])],
+                             ensure_ascii=False), axis=1)
+    df = df[df['notin']=='[]']
+    return df
+
+
 if __name__ == "__main__":
-    df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
-    df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
-    df = df[df.loc[:, 'pos']==0]  # 过滤掉未标注废标原因文章
-    df.reset_index(drop=True, inplace=True)
-    print('总文章数:',len(df))
-    df.fillna('', inplace=True)
-    print('读取完毕')
-    df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
-    lbset = [it for l in df['lbs'] for it in l]
-    c = collections.Counter(lbset)
-    m = c.most_common()
-    m3 = [it[0] for it in m if it[1] > 2]
-    df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
-    df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
-    print('len_df_dev:', len(df_dev))
-    df_reason = df[df.loc[:, 'reasons_list'] != '[]'].sample(frac=0.1, random_state=8)
-    print('len(df_reason)', len(df_reason))
-    df_dev.append(df_reason)
-    df_dev.drop_duplicates(subset=['docid'], inplace=True)
-    print('len_df_dev:', len(df_dev))
-    df_train = df[~df.index.isin(df_dev.index)]
-    print(len(df), len(df_dev), len(df_train))
-    df_train = df_train.sample(frac=1)
-    df_dev = df_dev.sample(frac=1)
-
-    # file = 'data/traindata.xlsx'
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+
+    # df = pd.read_excel('E:/产品及失败原因标注数据/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+    # # tfrecode_name = '20211125_ProductAndReason.tfrecode'
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    #
+    # df1 = pd.read_excel('E:/产品及失败原因标注数据/桥客宝产品数据1.xlsx')
+    #
+    # # tfrecode_name = 'qiaokebao1_product.tfrecode'
+    # df2 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据2.csv')
+    #
+    # # tfrecode_name = 'qiaokebao2_product.tfrecode'
+    # df3 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据3.csv')
+    # df = df.append([df1, df2, df3], ignore_index=True)
+    #
+    # tfrecode_name = 'all_product.tfrecode'
+    #
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    # df.fillna('', inplace=True)
+    # df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
+    # df = df[df.loc[:, 'pos']==0]  # 过滤掉未标注废标原因文章
+    # df.reset_index(drop=True, inplace=True)
+    # print('总文章数:',len(df))
+    # df.fillna('', inplace=True)
+    # print('读取完毕')
+    # df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
+    # lbset = [it for l in df['lbs'] for it in l]
+    # c = collections.Counter(lbset)
+    # m = c.most_common()
+    # m3 = [it[0] for it in m if it[1] > 2]
+    # print('m3: ', m3[:20])
+    # df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
+    # print('sum(pos): ', sum(df['pos']))
+    # df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_reason = df[df.loc[:, 'reasons_list'] != '[]']
+    # if len(df_reason)>10:
+    #     df_reason = df_reason.sample(frac=0.1, random_state=8)
+    #     print('len(df_reason)', len(df_reason))
+    #     df_dev.append(df_reason)
+    # df_dev.drop_duplicates(subset=['docid'], inplace=True)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_train = df[~df.index.isin(df_dev.index)]
+    # print(len(df), len(df_dev), len(df_train))
+    # df_train = df_train.sample(frac=1)
+    # df_dev = df_dev.sample(frac=1)
+
+
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕',len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[:sp]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train1.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
     instances = create_training_instances(df_train)
+    del df_train
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/train_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成1')
+    with open('E:\产品及失败原因标注数据/product_notin1.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
 
 
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕', len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[sp:]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train2.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'  # 去掉文本及产品里面的符号
+    instances = create_training_instances(df_train)
+    del df_train
+    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
+    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+    output_dir = 'data/train_data'
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成2')
+    with open('E:\产品及失败原因标注数据/product_notin2.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    df_dev = pd.read_csv('E:/产品及失败原因标注数据/df_dev.csv')
+
+    print('去除前', len(df_dev))
+    # df_dev = 去除标注不在公告里面的公告(df_dev)
+    # print('去除后', len(df_dev))
+    #
+    tfrecode_name = 'ProductAndReason_2023-02-24_dev.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
     instances = create_training_instances(df_dev)
+    del df_dev
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/test_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
-    print('全部写入成功!')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('全部写入成功!')
+    with open('E:\产品及失败原因标注数据/product_notin3.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+
+cursor.close()
+conn.close()

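Among the additions above, the fuzzy product lookup is the central trick: when a labelled product is absent from the text verbatim, '[^\w]{,2}'.join(p) interleaves its characters with a pattern tolerating up to two non-word characters, so punctuation inserted by the announcement formatting no longer breaks the match. A minimal sketch:

import re

text = '采购内容:台式(电脑)设备一批。'
product = '台式电脑'

# '台式电脑' becomes '台[^\w]{,2}式[^\w]{,2}电[^\w]{,2}脑', which matches the
# bracketed rendering in the text; the matched span then replaces the label.
pattern = '[^\w]{,2}'.join(product)
ser = re.search(pattern, text)
print(ser.group(0))  # 台式(电脑
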
+ 11 - 23
BiddingKG/dl/product/data_util.py

@@ -11,28 +11,16 @@ import numpy as np
 import pandas as pd
 from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word,viterbi_decode, load

-tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
-id_to_tag = {v:k for k,v in tag2index.items()}
-# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+tag2id = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+id_to_tag = {v:k for k,v in tag2id.items()}
 
 
-word_model = getModel_word()
-vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
-word2id = {k: v for v, k in enumerate(vocab)}
+path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
+path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
+vocab = load(path1)
+matrix = load(path2)
 max_id = len(vocab)
+word2id = {k: v for v, k in enumerate(vocab)}
 
 
-# path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
-# path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
-# vocab = load(path1)
-# matrix = load(path2)
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}
-
-# vocab = ["<pad>"] + word_model.index2word+ ["<unk>"]
-# matrix = np.zeros((len(vocab), 60))
-# for i in range(1, len(vocab)-1):
-#     matrix[i] = word_model[vocab[i]]
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}

 def df2data(df):
     import pandas as pd
@@ -211,8 +199,8 @@ def process_data(sentences):
     :return: 数字化后的统一长度
     '''
     maxLen = max([len(sentence) for sentence in sentences])
-    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
-    # tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
+    # tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
     pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
     return pad_tags

@@ -225,8 +213,8 @@ def get_ner(BIE_tag):
 def decode(logits, lengths, matrix):
     paths = []
     small = -1000.0
-    start = np.asarray([[small]*4+[0]])
-    # start = np.asarray([[small]*7+[0]])
+    # start = np.asarray([[small]*4+[0]]) # 只有产品
+    start = np.asarray([[small]*7+[0]]) # 产品及失败原因
     for score, length in zip(logits, lengths):
         score = score[:length]
         pad = small * np.ones([length, 1])

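data_util.py now loads the shared codename vocabulary and maps out-of-vocabulary characters to the <unk> id instead of max_id. A minimal sketch of the lookup-and-pad step in process_data, with a toy vocabulary standing in for the pickled codename_vocab.pk:

# Toy vocabulary; the real one comes from codename_vocab.pk.
vocab = ['<pad>', '设', '备', '采', '购', '<unk>']
word2id = {k: v for v, k in enumerate(vocab)}

def process_data(sentences):
    # Look every character up, fall back to <unk>, right-pad with 0 (<pad>)
    # to the longest sentence in the batch.
    maxLen = max(len(s) for s in sentences)
    tags = [[word2id.get(ch, word2id.get('<unk>')) for ch in s] for s in sentences]
    return [t[:maxLen] + [0] * (maxLen - len(t)) for t in tags]

print(process_data(['设备', '采购X']))  # [[1, 2, 0], [3, 4, 5]]
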
+ 2 - 1
BiddingKG/dl/product/main.py

@@ -116,7 +116,8 @@ def save_model_pb():
     #
     # 把cpkt转为pb

-    input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    # input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    input_checkpoint = "model/ner_epoch22_f10.7923_loss1.1039.ckpt" #2023/4/6
     saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
     graph = tf.get_default_graph()  # 获得默认的图
     input_graph_def = graph.as_graph_def()  # 返回一个序列号

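save_model_pb() now points at the retrained checkpoint. The usual TF1 recipe behind such a conversion restores the meta graph and folds variables into constants; a sketch under the assumption of a TF1 environment, with a hypothetical output node name (the real node names live in the repo's graph):

import tensorflow as tf
from tensorflow.python.framework import graph_util

input_checkpoint = "model/ner_epoch22_f10.7923_loss1.1039.ckpt"
output_node = "crf_scores/output"  # hypothetical; use the graph's real node

saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
with tf.Session() as sess:
    saver.restore(sess, input_checkpoint)
    # Fold trained variables into constants so the graph is self-contained.
    frozen = graph_util.convert_variables_to_constants(
        sess, tf.get_default_graph().as_graph_def(), [output_node])
    with tf.gfile.GFile("productAndfailreason.pb", "wb") as f:
        f.write(frozen.SerializeToString())
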
+ 79 - 0
BiddingKG/dl/product/predict.py

@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2023/3/27 10:19
+"""
+from BiddingKG.dl.product.product_model import Product_Model
+import os
+import re
+import time
+import pandas as pd
+import tensorflow as tf
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+def predict():
+    ckpt_path = "model"
+    import json
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch16_f10.8000_loss1.0775.ckpt')
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch7_f10.7998_loss1.0508.ckpt')
+        model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch22_f10.7923_loss1.1039.ckpt') # 整理数据后再次训练
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch18_f10.8000_loss1.1276.ckpt') # 新
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch5_f10.6855_loss1.3800.ckpt') # 旧
+        t1 = time.time()
+
+        print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # df = pd.read_csv(os.path.dirname(__file__) + '/data/df_test.csv') #../test/
+        df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+        print('公告数量:', len(df))
+        df.fillna('', inplace=True)
+        # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+        df.reset_index(drop=True, inplace=True)
+        rs = []
+        for i in df.index:
+            text = df.loc[i, 'text']
+            # result = model.evaluate_line(sess, text)
+            # print(result[0][1])
+            # rs.append(json.dumps(result[0][1], ensure_ascii=False))
+
+            tmp = []
+            for line in text.split('。'):
+                # line = re.sub('[^\w]', '', line)
+                # if len(line) < 5:
+                #     continue
+                result = model.evaluate_line(sess, line)
+                # print(result[0][1])
+                tmp.extend(result[0][1])
+            rs.append(json.dumps(tmp, ensure_ascii=False))
+        df['predict_new'] = pd.Series(rs)
+        df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx', index=False)
+        print('耗时: ', time.time()-t1)
+        return df
+
+def 统计准确率(df):
+    import json
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+    df['pr'] = df['predict_new'].apply(lambda x:set([it[0] for it in json.loads(x)]))
+    df['lb'] = df['lbset'].apply(lambda x: set(json.loads(x)))
+    df['pos'] = df.apply(lambda x:1 if x['pr']==x['lb'] else 0, axis=1)
+    eq = lb = pr = 0
+    for i in df.index:
+        pred = df.loc[i, 'pr']
+        label = df.loc[i, 'lb']
+        lb += len(label)
+        pr += len(pred)
+        eq += len(pred&label)
+    acc = eq/pr
+    recall = eq/lb
+    f1 = acc*recall*2/(acc+recall)
+    print('准确率:%.4f,召回率:%.4f,F1:%.4f'%(acc, recall, f1))  # 准确率:0.6489,召回率:0.8402,F1:0.7323
+    # df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+
+if __name__ == "__main__":
+    df = predict()
+    统计准确率(df)

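统计准确率 scores each document's predicted product set against the labelled set and pools the counts into precision, recall and F1. A worked micro-example of the same counters:

# eq counts exact set overlaps, pr and lb count totals on each side.
pred = {'电脑', '打印机', '投影仪'}
label = {'电脑', '打印机', '扫描仪'}
eq, pr, lb = len(pred & label), len(pred), len(label)
acc, recall = eq / pr, eq / lb               # 2/3 and 2/3
f1 = acc * recall * 2 / (acc + recall)       # 0.6667
print('准确率:%.4f,召回率:%.4f,F1:%.4f' % (acc, recall, f1))
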
+ 63 - 27
BiddingKG/dl/product/train.py

@@ -9,6 +9,9 @@
 # @Time    : 2021/1/13 0013 10:12
 import os
 import re
+import time
+import logging
+logging.basicConfig(level=logging.DEBUG)
 print('准备导入tf')
 import tensorflow as tf
 print('准备导入np')
@@ -19,11 +22,14 @@ print('准备导入max_len')
 from BiddingKG.dl.product.data_tfrecord import max_len
 # from BiddingKG.dl.common.Utils import viterbi_decode
 print('准备设置CUDA环境')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 # max_len = 500
+batch_size = 256
+MIN_AFTER_DEQUEUE = batch_size*500
 
 
-def read_tfRecord(sess, file_tfRecord):
-    queue = tf.train.string_input_producer([file_tfRecord])
+def read_tfRecord(sess, file_list):
+    # queue = tf.train.string_input_producer([file_tfRecord])
+    queue = tf.train.string_input_producer(file_list)
     reader = tf.TFRecordReader()
     filename_, serialized_example = reader.read(queue)
     features = tf.parse_single_example(
@@ -39,13 +45,13 @@ def read_tfRecord(sess, file_tfRecord):
     text_len = tf.cast(features['text_len'], tf.int64)
     return text_len, word_ids , tag_ids

-def get_batch_record(sess,filename, batch_size):
-    text_len, word_ids, tag_ids = read_tfRecord(sess, filename)
+def get_batch_record(sess,file_list, batch_size):
+    text_len, word_ids, tag_ids = read_tfRecord(sess, file_list)
     text_len, word_ids, tag_ids = tf.train.shuffle_batch([text_len, word_ids , tag_ids],
                                                      batch_size=batch_size,
-                                                     capacity=200+batch_size*3,
-                                                     min_after_dequeue=1,
-                                                     num_threads=5)
+                                                     capacity=MIN_AFTER_DEQUEUE+batch_size*3,
+                                                     min_after_dequeue=MIN_AFTER_DEQUEUE,
+                                                     num_threads=8)
     text_len = tf.squeeze(text_len, squeeze_dims=1)
     return text_len, word_ids , tag_ids

@@ -60,14 +66,36 @@ def total_sample(file_name):
         sample_num += 1
     return sample_num

-if __name__ == "__main__":
-    print('进入main ')
-    filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
-    filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
-    assert os.path.exists(filename)
-    assert os.path.exists(filename_dev)
+def train():
+    logging.info('进入main ')
+    # filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
+    # filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
+    # print('os.path.dirname(__file__): ', os.path.dirname(__file__))
+    # print('filename path :', filename)
+    # assert os.path.exists(filename)
+    # assert os.path.exists(filename_dev)
+
+    file_list = []
+    file_list_dev = []
+    train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train1.tfrecode'
+    train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train2.tfrecode'
+    dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-02-24_dev.tfrecode'
+
+    # train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
+    # train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'
+    # dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
+
+    # print('filename path :', train1, os.path.exists(train1))
+
+    file_list.append(train1)
+    file_list.append(train2)
+
+    file_list_dev.append(dev1)
+
+
     print('确保文件存在')
-    batch_size = 100
+    print('filename path :', train1, os.path.exists(train1))
+    # batch_size = 512
     # id_to_tag = {0: 'O', 1: 'B', 2: 'I', 3: 'E'}
     tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
     id_to_tag = {v:k for k,v in tag2index.items()}
@@ -88,25 +116,28 @@ if __name__ == "__main__":
         init_op = tf.global_variables_initializer()
         sess.run(init_op)
         print('参数初始化')
-        text_len, word_ids, tag_ids = get_batch_record(sess, filename, batch_size=batch_size)
+        text_len, word_ids, tag_ids = get_batch_record(sess, file_list, batch_size=batch_size)
         print('get_batch_record')
-        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, filename_dev, batch_size=batch_size)
+        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, file_list_dev, batch_size=batch_size)
         print('get_batch_record_dev')
         coord = tf.train.Coordinator()
         threads = tf.train.start_queue_runners(coord=coord)
-        print('total_sample(filename)', total_sample(filename))
 
 
-        total_num = total_sample(filename)
+        total_num = sum([total_sample(filename) for filename in file_list])
+        logging.info('total_train_num: %d'%total_num)
         batch_num = total_num//batch_size
-        batch_num_dev = total_sample(filename_dev)//batch_size
+        batch_num_dev = sum([total_sample(filename_dev) for filename_dev in file_list_dev])//batch_size
         num = 0
         l = []
-        max_f1 = 0
 
 
+        max_f1 = 0.79
         # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch10_f10.6875_loss1.5230.ckpt'))
-        # print('模型加载成功')
+        # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch0_f10.7740_loss1.2526.ckpt'))
+        model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch16_f10.8000_loss1.0775.ckpt'))
+        print('模型加载成功')
 
 
-        for epoch in range(50):
+        for epoch in range(20,50):
+            t1 = time.time()
             for batch in range(batch_num):
                 text_len_, word_ids_, tag_ids_ = sess.run([text_len, word_ids, tag_ids])
                 # print(text_len_.shape, word_ids_.shape, tag_ids_.shape)
@@ -118,9 +149,10 @@ if __name__ == "__main__":


                 if batch % 100==0:
-                    print('loss_:', loss_, '\tglobel_step_:',globel_step_)
+                    logging.info('loss_:%.4f,\tglobel_step_: %d'%(loss_, globel_step_))
+                    print('耗时:', time.time()-t1)
                 num += text_len_.shape[0]
-            print('训练数:%d, 样本总数:%d'%(num, total_num))
+            # print('训练数:%d, 样本总数:%d'%(num, total_num))

             results = []
             trans = model.trans.eval()
@@ -154,15 +186,19 @@ if __name__ == "__main__":
             recall = equal_num / (gold_num + 1e-10)
             f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
             val_loss = np.mean(loss)
-            print('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
+            logging.info('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
             if f1>max_f1:
                 max_f1 = f1
                 model.saver.save(sess, os.path.join(os.path.dirname(__file__)+'/model', "ner_epoch%d_f1%.4f_loss%.4f.ckpt"%(epoch,f1, val_loss)))
-                print('save model, max_f1:%.4f' %f1)
+                logging.info('save model, max_f1:%.4f' %f1)

         coord.request_stop()
         coord.join(threads)

+if __name__ == "__main__":
+    train()
+
+



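The input pipeline above now reads both tfrecord shards through one queue and shuffles with a much deeper buffer (min_after_dequeue = batch_size*500 instead of 1), at the cost of queue warm-up memory. A condensed sketch of that wiring, assuming the serialized features follow max_len = 500:

import tensorflow as tf

batch_size = 256
MIN_AFTER_DEQUEUE = batch_size * 500  # deeper buffer => better cross-shard shuffling

def batched_inputs(file_list):
    # One reader over all shards, then a shuffling batch queue sized off the buffer.
    queue = tf.train.string_input_producer(file_list)
    _, serialized = tf.TFRecordReader().read(queue)
    features = tf.parse_single_example(serialized, features={
        'text_len': tf.FixedLenFeature([1], tf.int64),
        'word_ids': tf.FixedLenFeature([500], tf.int64),
        'tag_ids': tf.FixedLenFeature([500], tf.int64),
    })
    return tf.train.shuffle_batch(
        [features['text_len'], features['word_ids'], features['tag_ids']],
        batch_size=batch_size,
        capacity=MIN_AFTER_DEQUEUE + batch_size * 3,
        min_after_dequeue=MIN_AFTER_DEQUEUE,
        num_threads=8)
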
+ 8 - 4
BiddingKG/dl/time/re_servicetime.py

@@ -23,6 +23,7 @@ before = '(?P<before>' \
          '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
          '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
          '|交货时间|工期|质保期' \
+         '|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)(年限|期限|周期|期:)' \
          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
          '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
          '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
@@ -61,7 +62,7 @@ before2 = '(?P<before2>' \
         # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \

 before3 = '(?P<before3>' \
-          '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
+          '([\((](日历天|施工时间)[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
           ')'

 before4 = '(?P<before4>' \
@@ -98,7 +99,7 @@ number = '(?P<number>' \
          ')'

 after = '(?P<after>' \
-        '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
+        '[个,,(\(]*(日历|历天|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
         ')'
         # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
         # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
@@ -265,7 +266,7 @@ def filter_service_time(output_list, text_index_list):
         if not re.findall(reg_right_digit, output):
             delete_list.append([output, text_index_list[i]])
             continue
-        if not re.findall(reg_right_unit, output):
+        if not re.findall(reg_right_unit, output) and not re.match('^\d{1,3}$', output):
             delete_list.append([output, text_index_list[i]])
             continue
         # 包含不要的字
@@ -352,7 +353,10 @@ def extract_servicetime(text):
 def test_from_str():
     # s = """
     # """
-    s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "交货,1.交货时间:7天,2.交货地点:广东清远市清城区飞来峡镇人民政府高田应急安置点"
+    s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
+'''
     print(extract_servicetime(s))
     print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
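re_servicetime.py builds one large expression out of named sections (before, before2, before3, number, after), and the filter fix above keeps bare one-to-three-digit outputs that the unit check used to drop. A minimal sketch of the named-group assembly, reduced to a few alternatives from the lists above:

import re

before = '(?P<before>工期|服务期限?|交货时间|维保期)[::为]?'
number = '(?P<number>[0-9一二三四五六七八九十]{1,4})'
after = '(?P<after>个?(日历天|工作日|[年月日天周]))?'
pattern = before + number + after

for m in re.finditer(pattern, '施工工期:14日历天,维保期:2年'):
    print(m.group('before'), m.group('number'), m.group('after'))
# 工期 14 日历天
# 维保期 2 年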