
Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

luojiehua · 1 year ago · commit 251d20468e

+ 161 - 15
BiddingKG/dl/bidway/re_bidway.py

@@ -200,6 +200,7 @@ import re
 #
 #     return output_list[0], text_index_list[0]
 
+normal_bidway = "公开招标|邀请招标|竞争性谈判|竞争性磋商|单一来源|框架协议|询价"
 
 bidway = '单一来源' \
          '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
@@ -210,17 +211,17 @@ bidway = '单一来源' \
          '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
          '|库内邀请|库内公开发包|内部邀标' \
          '|定点采购议价|定点采购' \
-         '|竞争性评审'
+         '|竞争性评审|框架协议'
 
 not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
              '|限时竞价|咨询单位|询价单'
 
-not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除|可以选择|包括|涉及|非"
 
 not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
                     "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
                     "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
-                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
+                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败|小组"
 
 bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
                  '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
@@ -268,21 +269,64 @@ def re_standard_bidway(_str):
     bidway_list = []
     if match:
         for m in match:
-            m_dict = m.groupdict()
-            m_span = m.span()
-            keyword = ""
-            keyword_index = [m_span[0], m_span[1]]
-            for key in m_dict.keys():
-                if key == "value":
-                    keyword = m_dict.get(key)
-                else:
-                    keyword_index[0] += len(m_dict.get(key))
+            keyword = m.group('value')
+            keyword_index = list(m.span('value'))
+            behind_str = _str[m.start(): m.end()+30]
+            if len(re.findall(normal_bidway, behind_str))>1:
+                keyword = ''
+                for it in re.finditer('(?P<sign>.{1,2})(?P<bidway>'+normal_bidway+')+', behind_str): # handle multiple options listed after the bidding-method keyword
+                    if '□' != it.group('sign')[-1]:
+                        keyword = it.group('bidway')
+                        keyword_index = [m.start()+it.start('bidway'), m.start()+it.end('bidway')]
+                        break
+            # m_dict = m.groupdict()
+            # m_span = m.span()
+            # keyword = ""
+            # keyword_index = [m_span[0], m_span[1]]
+            # for key in m_dict.keys():
+            #     if key == "value":
+            #         keyword = m_dict.get(key)
+            #     else:
+            #         keyword_index[0] += len(m_dict.get(key))
             bidway_list.append([keyword, keyword_index])
 
     return bidway_list
 
+def re_normal_bidway(_str):
+    ser = re.search("("+normal_bidway+")(转为?|变更为|更改为)"+"(?P<bidway>(" + normal_bidway + "))", _str) # if the method was changed, take the post-change one
+    if ser:
+        return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    reg_all = "(?P<value>" + normal_bidway + ")"
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    bidway_set = set()
+    if match:
+        for m in match:
+            keyword = m.group()
+            if keyword == '公开招标' and m.start()>0 and _str[m.start()-1]=='非':
+                continue
+            keyword_index = list(m.span())
+            bidway_set.add(keyword)
+            bidway_list.append([keyword, keyword_index])
+    if len(bidway_list) == 0: # if no standard method is found, fall back to abbreviated forms
+        ser = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
+        if ser:
+            return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    if len(bidway_set) > 1: # return empty when multiple distinct methods match
+        return []
+    return bidway_list
 
 def re_all_bidway(_str):
+    reg_all = "(?P<value>" + normal_bidway + ")" # prefer the normalized bidding methods
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    if match:
+        for m in match:
+            keyword = m.group()
+            keyword_index = list(m.span())
+            bidway_list.append([keyword, keyword_index])
+    return bidway_list
+
     reg_all = "(?P<value>" + bidway + ")"
     match = re.finditer(reg_all, _str)
     bidway_list = []
@@ -339,6 +383,13 @@ def get_one_word(bidway_list):
 
 
 def re_bidway(text, title):
+    # first try the standard bidding methods in the title
+    if len(title)<100:
+        bidway_list = re_normal_bidway(title)
+        if bidway_list:
+            word, text_index = get_one_word(bidway_list)
+            return word, text_index
+
    # replace easily confused words
     text_clean = re_not_bidway(text)
     title_clean = re_not_bidway(title)
@@ -406,12 +457,30 @@ bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
                '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
                '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
                '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
-               '网上询价': '询价'}
+               '网上询价': '询价', '框架协议': '框架协议', '谈判':'竞争性谈判'}
# unify bidway names to a standard set
 def bidway_integrate(bidway):
     integrate_name = bidway_dict.get(bidway,"其他")
     return integrate_name
 
+def bidway_normalize(key):
+    if re.search('公开招标|公开发包', key):
+        return '公开招标'
+    elif re.search('单一来源', key):
+        return '单一来源'
+    elif re.search('磋商', key):
+        return '竞争性磋商'
+    elif re.search('谈判', key):
+        return '竞争性谈判'
+    elif re.search('竞谈|竞价|竞投|竞标', key):
+        return '竞价'
+    elif re.search('询价|询比|比价|询单', key):
+        return '询价'
+    elif re.search('邀请|邀标', key):
+        return '邀请招标'
+    else:
+        return bidway_dict.get(key, '其他')
+
 def test_csv():
     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
 
@@ -456,13 +525,90 @@ def test_str():
 
 
 def test_html():
-    html_path = "C:/Users/Administrator/Desktop/3.html"
+    # html_path = "C:/Users/Administrator/Desktop/3.html"
+    html_path = 'd:/html/2.html'
 
-    with open(html_path, "r") as f:
+    with open(html_path, "r", encoding='utf-8') as f:
         s = f.read()
 
     print(extract_bidway(s, title=""))
 
+def get_valuate():
+    import psycopg2
+    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')
+    cursor = conn.cursor()
+    sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;" # where docid='110635873'
+    # sql = "select c1.docid, c1.doctitle from corpus_otherinput c1;"
+    # sql = "select text from corpus_iedocument limit 50000;"
+    cursor.execute(sql)
+    datas = []
+    olds = []
+    news = []
+    label_old = []
+    label_new = []
+    labels = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        doctitle = row[1]
+        ex = row[2]
+        text = row[3]
+        ser = re.search('"bidway": "(\w{,6})"', ex)
+        # print('ser:', ser)
+        old = ser.group(1) if ser else ""
+        pred = extract_bidway(text, title=doctitle)
+
+        # list_bidway = extract_bidway(text, title=doctitle)
+        # print('list_bidway', list_bidway)
+        # if list_bidway:
+        #     bidway = list_bidway[0].get("body")
+        #     # bidway名称统一规范
+        #     bidway = bidway_integrate(bidway)
+        # else:
+        #     bidway = ""
+        # print('bidway: ', bidway)
+
+        pred = pred[0]['body'] if len(pred) > 0 else ""
+        new = bidway_dict.get(pred, "其他") if pred!="" else ""
+        sql2 = "select value from brat_bratannotation where document_id='{0}' and value like '%bidway%' limit 4;".format(docid)
+        cursor.execute(sql2)
+        lb_new = docid + "_"
+        lb_old = docid + "_"
+        tmp_l = []
+        for row in cursor.fetchall():
+            lb = row[0].split()[-1]
+            lb = bidway_dict.get(lb, "其他")  # new: accuracy 0.9642, recall 0.9642, F1 0.8965
+            # lb = bidway_normalize(lb)   # old: accuracy 0.9287, recall 0.9287, F1 0.8011; new: accuracy 0.9692, recall 0.9692, F1 0.9105
+
+            tmp_l.append(lb)
+            if lb == new:
+                lb_new = docid + "_" + lb
+            if lb == old:
+                lb_old = docid + "_" + lb
+        olds.append(docid + "_" + old)
+        news.append(docid + "_" + new)
+        label_new.append(lb_new)
+        label_old.append(lb_old)
+        labels.append(';'.join(tmp_l))
+        datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, ';'.join(tmp_l)))
+
+    eq_old = len(set(olds)&set(label_old))
+    eq_new = len(set(news)&set(label_new))
+
+    acc_old = eq_old/len(set(olds))
+    recall_old = eq_old/len(set(label_old))
+    f1_old = 2*acc_old*recall_old/(acc_old+recall_old)
+
+    acc_new = eq_new/len(set(news))
+    recall_new = eq_new/len(set(label_new))
+    f1_new = 2*acc_new*recall_new/(acc_new+recall_new)
+    print('旧准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
+    print('新准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))
+
+
+    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
+    df['old_pos'] = df.apply(lambda x:1 if x['pred_old']==x['label_old'] else 0, axis=1)
+    df['new_pos'] = df.apply(lambda x:1 if x['pred_new']==x['label_new'] else 0, axis=1)
+    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)
 
 if __name__ == "__main__":
     # extract_bidway(s)
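
A quick, hedged sanity check of the new helpers (hypothetical inputs; the expected values follow from the regexes above):

# re_normal_bidway prefers the post-change method and treats multiple distinct methods as ambiguous:
print(re_normal_bidway("本项目由公开招标变更为竞争性磋商"))  # expected: [['竞争性磋商', [11, 16]]]
print(re_normal_bidway("公开招标或邀请招标"))  # expected: [] (two distinct methods)
# bidway_normalize maps free-form phrases onto the standard categories:
print(bidway_normalize("网上询价"))  # expected: 询价
print(bidway_normalize("单一来源采购"))  # expected: 单一来源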

+ 70 - 17
BiddingKG/dl/common/Utils.py

@@ -305,6 +305,8 @@ def changeIndexFromWordToWords(tokens,word_index):
         if before_index<=word_index and after_index>word_index:
             return i
         before_index = after_index
+    return i+1
+
         
 def getIndexOfWords(words):
     global vocab_words,file_vocab_words
@@ -604,7 +606,18 @@ def fitDataByRule(data):
     result = re.sub("[。]","",result)
     return  result
 
-time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+from datetime import date
+# validate that year/month/day form a real calendar date
+def isValidDate(year, month, day):
+    try:
+        date(year, month, day)
+    except:
+        return False
+    else:
+        return True
+
+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(time_format_pattern,_time)
@@ -622,22 +635,48 @@ def timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$",year):
+                    if len(year)==2:
+                        year = "20"+year
+                    if int(year)>int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month)>12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month>=1 and month<=12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day)>31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            # print(year,month,day)
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
                 return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
     return ""
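
A minimal sketch of what the widened date pattern now accepts (hypothetical inputs; assumes getUnifyNum and getDigitsDic convert Chinese numerals as they do elsewhere in Utils):

print(timeFormat("开标时间:二〇二三年七月十八日"))  # expected: 2023-07-18
print(timeFormat("2023/2/29"))  # expected: "" (isValidDate rejects Feb 29, 2023)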
@@ -797,23 +836,28 @@ def uniform_package_name(package_name):
     '''
     package_name_raw = package_name
     package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
+    package_name = package_name.replace('标段(包)', '标段').replace('№', '')
+    package_name = re.sub('\[|【', '', package_name)
     kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
     name = ""
     if kw:
         name += kw.group(0)
    if re.search('^[a-zA-Z0-9-]{5,}$', package_name):   # identifiers of five or more characters
         _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
+        # print('规范化包号1', _digit)
         name += _digit
    elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # handle patterns like A包2标段
         ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
+        # print('规范化包号2', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
         _digit = ser.groupdict().get('num')
         _digit = uniform_num(_digit)
         name += _char.upper() + _digit
-    elif re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name): # handle patterns like A包2标段
-        ser = re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name)
+    elif re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # handle patterns like A包2标段
+        ser = re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name)
+        # print('规范化包号3', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
@@ -822,8 +866,9 @@ def uniform_package_name(package_name):
         if _char:
             name += _char.upper()
         name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # unify numerals to Arabic digits
-        ser = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+    elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # unify numerals to Arabic digits
+        ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+        # print('规范化包号4', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
@@ -832,22 +877,28 @@ def uniform_package_name(package_name):
         if _char:
             name += _char.upper()
         name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name):  # unify numerals to Arabic digits
-        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name).group('eng').upper()
+    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name):  # unify numerals to Arabic digits
+        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name).group('eng').upper()
+        # print('规范化包号5', _digit)
         name += _digit
    elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name):  # unify numerals to Arabic digits
         _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
+        # print('规范化包号6', _digit)
         name += _digit
    elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name):  # unify numerals to Arabic digits
         _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
+        # print('规范化包号7', _digit)
         _digit = uniform_num(_digit)
         name += _digit
     elif re.search('^[a-zA-Z0-9-]+$', package_name):
         _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
+        # print('规范化包号8', _char)
         name += _char.upper()
     if name == "":
         return package_name_raw
     else:
+        if name.isdigit():
+            name = str(int(name))
         # print('原始包号:%s, 处理后:%s'%(package_name, name))
         return name
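
A hedged sketch of the intended normalization outcomes (hypothetical inputs; assumes uniform_num maps numerals to Arabic digits):

print(uniform_package_name("A包2标段"))  # expected: A2
print(uniform_package_name("第2标段"))  # expected: 2
print(uniform_package_name("02包"))  # expected: 2 (purely numeric names now drop leading zeros via int())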
 
@@ -860,11 +911,13 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)
-        if '万元' in header and '万' not in money_text:
+        if re.search('万元|[((]万[))]', header) and '万' not in money_text:  # fix docid 37797825: headers like 控制价(万)
             money_text += '万元'
+        # money = float(getUnifyMoney(money_text))
         money = float(getUnifyMoney(money_text))
        if money > 10000000000000:  # drop implausible amounts above 1e13
             money = 0
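
A small, hedged check of the header fix (hypothetical values; assumes getUnifyMoney parses comma-grouped numbers and expands 万元, and that the function returns (money, money_unit) as its locals suggest):

money, unit = money_process("123,456.78", "控制价(万)")
print(money)  # expected: 1234567800.0 -- the (万) header now appends the 万元 suffix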

+ 3 - 3
BiddingKG/dl/foolnltk/selffool/lexical.py

@@ -100,7 +100,7 @@ class LexicalAnalyzer(object):
                 lb = label.split("_")[0]
 
                 if lb == "S":
-                    ens.append((i, i + 1, lt, word))
+                    ens.append((i-1, i, lt, word))
                 elif lb == "B":
                     entity = ""
                     entity += word
@@ -109,11 +109,11 @@ class LexicalAnalyzer(object):
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity), i, lt, entity))
                     entity = ""
 
             if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                ens.append((i - len(entity), i, lt, entity))
             all_entitys.append(ens)
 
         return all_entitys

File diff suppressed because it is too large
+ 0 - 2
BiddingKG/dl/interface/Preprocessing.py


BIN
BiddingKG/dl/interface/agency_set.pkl


+ 77 - 3
BiddingKG/dl/interface/extract.py

@@ -13,6 +13,7 @@ import os
 import codecs
 import requests
 import time
+from unicodedata import normalize
 
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
@@ -114,11 +115,74 @@ def extractCount(extract_dict):
         extract_count += 1
     return extract_count
 
+# normalize character encoding (NFKD), preserving CJK punctuation
+def str_normalize(text):
+    # time1 = time.time()
+    cn_punctuation = "¥,。:;{}!?()<"
+    text_split = re.split("([{}]+)".format(cn_punctuation),text)  # capture whole punctuation runs so no character is dropped
+    # print(text_split)
+    new_text = ""
+    for s in text_split:
+        if re.search("^[{}]+$".format(cn_punctuation),s):
+            new_text += s
+        else:
+            new_text += normalize('NFKD', s)
+    # print("str_normalize cost time %s"%str(time.time()-time1))
+    # print(new_text)
+
+    return new_text
+# repair prem entities whose region prefix is incomplete
+def repair_entity(prem,district_dict,list_articles):
+    district_dict = district_dict['district']
+    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知','全国'] else ""
+    city = district_dict['city'] if district_dict['city'] and district_dict['city']!='未知' else ""
+    district = district_dict['district'] if district_dict['district'] and district_dict['district']!='未知' else ""
+    content_text = list_articles[0].content
+
+    autonomous_region_dict = {
+        "新疆":"新疆维吾尔",
+        "西藏":"西藏",
+        "内蒙古":"内蒙古",
+        "广西":"广西壮族",
+        "宁夏":"宁夏回族"
+    }
+
+    for package,_prem in prem[0]['prem'].items():
+        for role in _prem['roleList']:
+            if role['role_name'] in ['tenderee','agency']:
+                role_text = role['role_text']
+                if re.search("^[省市县区]",role_text):
+                    if role_text[0]=='省' and role_text[:2] not in ['省道']:
+                        role['role_text'] = province + role_text
+                    elif role_text[0]=='市' and role_text[:2] not in ['市政','市场']:
+                        if district+'市' in content_text:
+                            # county-level city
+                            role['role_text'] = district + role_text
+                        else:
+                            role['role_text'] = city + role_text
+                    elif role_text[0] in ['县','区']:
+                        role['role_text'] = district + role_text
+                elif re.search("^自治[区州县]",role_text):
+                    if role_text[:3]=='自治区':
+                        role['role_text'] = autonomous_region_dict.get(province,"") + role_text
+                    elif role_text[:3] in ['自治县',"自治州"]:
+                        if re.search("自治[县州]?$",district):
+                            role['role_text'] = re.sub("自治[县州]?","",district) + role_text
+                        elif re.search("族$",district):
+                            role['role_text'] = district + role_text
+                        elif re.search("自治[县州]?$",city):
+                            role['role_text'] = re.sub("自治[县州]?","",city) + role_text
+                        elif re.search("族$",city):
+                            role['role_text'] = city + role_text
+
+
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
+    # normalize character encoding
+    text = str_normalize(text)
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
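
Two hedged illustrations of the new preprocessing steps (hypothetical inputs and shapes, mirroring what predict() passes in):

# str_normalize folds fullwidth alphanumerics to ASCII but keeps the listed CJK punctuation:
print(str_normalize("预算:1000(元)"))  # expected: 预算:1000(元)

# repair_entity prepends the extracted district to truncated role names:
from types import SimpleNamespace
district = {'district': {'province': '广东', 'city': '深圳', 'district': '南山'}}
prem = [{'prem': {'Project': {'roleList': [{'role_name': 'tenderee', 'role_text': '区人民医院'}]}}}]
repair_entity(prem, district, [SimpleNamespace(content='南山区人民医院采购公告')])
print(prem[0]['prem']['Project']['roleList'][0]['role_text'])  # expected: 南山区人民医院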
@@ -209,11 +273,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
    '''table element extraction'''
     table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
+    # print('表格提取中标人:', table_prem)
+    # print('原提取角色:', prem[0]['prem'])
     if table_prem:
         getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
    '''candidate extraction'''
     candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    # print('表格提取候选人:', candidate_top3_prem)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
    '''extract consortium information'''
@@ -259,14 +326,21 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
     cost_time["district"] = round(time.time() - start_time, 2)
 
-    '''cap the maximum amount by industry'''
-    getAttributes.limit_maximum_amount(prem, industry)
+    '''repair entities based on the district extraction result'''
+    repair_entity(prem,district,list_articles)
+
+    # '''cap the maximum amount by industry'''
+    # getAttributes.limit_maximum_amount(prem, industry) # disabled 20230703; correction now runs after all elements are merged
 
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-01-06'}
+    version_date = {'version_date': '2023-07-18'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
+
+    '''final check to correct the tender and award amounts'''
+    getAttributes.limit_maximum_amount(data_res, list_entitys[0])
+
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

File diff suppressed because it is too large
+ 346 - 308
BiddingKG/dl/interface/getAttributes.py


BIN
BiddingKG/dl/interface/header_set.pkl


+ 5 - 5
BiddingKG/dl/interface/modelFactory.py

@@ -45,7 +45,7 @@ class Model_role_classify_word():
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
-        self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
+        # self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
@@ -64,9 +64,9 @@ class Model_role_classify_word():
               
               input0 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
               input1 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
-              input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+              # input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
               output = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-              self.model_role = [[input0,input1,input2],output]
+              self.model_role = [[input0,input1],output]  #,input2
         return self.model_role
     '''
     def load_weights(self):
@@ -75,9 +75,9 @@ class Model_role_classify_word():
     '''
     
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
-        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
+        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=20,center_include=False,word_flag=True,text=entity_text) #size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
     

File diff suppressed because it is too large
+ 453 - 150
BiddingKG/dl/interface/predictor.py


BIN
BiddingKG/dl/interface/product_savedmodel/productAndfailreason.pb


BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


+ 15 - 10
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -10,8 +10,8 @@ def re_rule():
     data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
 
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)" 
-              "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
-              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+              "(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
+              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
                "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
     num = 0
     moneySourceList = []
@@ -76,14 +76,14 @@ def re_rule():
 
 def extract_moneySource(text):
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                      "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
-                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+                      "(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
+                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
                       "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
 
     re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
     re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                     r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
-    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
+                     r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]")
+    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
 
     sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
                      "及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
@@ -115,6 +115,7 @@ def extract_moneySource(text):
                 # print(groupdict1)
                 if source1:
                     groupdict1["index"] = word_index
+                    groupdict1["prob"] = 0.9
                     # print(groupdict1['index'])
                     results.append(groupdict1)
             word_index += len(item)
@@ -127,8 +128,9 @@ def extract_moneySource(text):
                     groupdict2 = res.groupdict()
                     source2 = groupdict2['moneySource']
                     # print("source2==>",source2)
-                    if source2 and not re_error.search(source2):
+                    if source2 and not re_error.search(res.group()):
                         groupdict2["index"] = copy_index
+                        groupdict2["prob"] = 0.8
                         results.append(groupdict2)
                 copy_index += len(item)
     first = []
@@ -148,7 +150,7 @@ def extract_moneySource(text):
     for result in first:
         entity_text = sub.sub("",result['moneySource'])
         # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
-        if entity_text is None:
+        if entity_text is None or len(entity_text)>40:
             continue
         else:
             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
@@ -158,6 +160,7 @@ def extract_moneySource(text):
             _moneySource['body'] = entity_text
             _moneySource['begin_index'] = wordOffset_begin
             _moneySource['end_index'] = wordOffset_end
+            _moneySource['prob'] = result['prob']
             # print(_moneySource)
             list_moneySource.append(_moneySource)
     return list_moneySource
@@ -166,7 +169,9 @@ def extract_moneySource(text):
 
 if __name__ == '__main__':
     # re_rule()
-    test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。"
+    test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
+          "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
+    # test = ",资金来源是否都是要具体到每条来源明细,"
     # 11,23 35,37
-    extract_moneySource(test)
+    print(extract_moneySource(test))
     pass
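
With the added prob field, each returned dict now carries a rule confidence: 0.9 for the primary pattern, 0.8 for the fallback. A hedged sketch of one returned element (values illustrative):

# {'body': '财政资金', 'begin_index': ..., 'end_index': ..., 'prob': 0.9}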

+ 294 - 51
BiddingKG/dl/product/data_tfrecord.py

@@ -10,6 +10,21 @@ import os
 import re
 import collections
 from BiddingKG.dl.product.data_util import word2id, max_id
+import psycopg2
+import json
+import pickle
+
+conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='bid_validate')
+cursor = conn.cursor()
+def get_title(docid):
+    sql = "select doctitle from qiao_ke_bao_raw where docid='{0}'".format(docid)
+    cursor.execute(sql)
+    for row in cursor.fetchall():
+        return row[0]
+    return ''
+
+product_notin = []
+
 max_len = 500
 
 def create_int_feature(values):
@@ -61,8 +76,11 @@ def fix_label_ner_句号分开(sentence, product_list, reasons_list):
 
 
 def create_instances_from_document_句号分开(docid, document_text, product_list, reasons_list):
-    for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
-        document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    # for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
+    #     document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    for it in re.finditer('[^\w\d。][一二三四五六七八九十]{1,3}、', document_text):
+        t = it.group(0)
+        document_text = document_text.replace(t, '。' + t[1:])
 
     if docid in ['docid']:
         pass
@@ -137,6 +155,10 @@ def fix_label_ner(sentence, product_list, reasons_list):
     tag_list = ['S'] * len(sentence)
     word_list = list(sentence)
     for product in product_list:
+        if len(re.sub('[^\w]', '', product))<1:
+            print('错误产品: ', product)
+            continue
+
         b = sentence.find(product)
         while b != -1:
             e = b + len(product)
@@ -158,10 +180,97 @@ def fix_label_ner(sentence, product_list, reasons_list):
             b = sentence.find(reason, e)
     return tag_list, word_list
 
+def fix_label_ner_remove_punctuation(sentence, product_list, reasons_list):
+    tag_list = ['S'] * len(sentence)
+    word_list = list(sentence)
+    if len(product_list)>0:
+        for it in re.finditer('|'.join(product_list), sentence):
+            b, e = it.span()
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-pro'
+                tag_list[e - 1] = 'E-pro'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-pro'
+
+    for reason in reasons_list:
+        b = sentence.find(reason)
+        while b != -1:
+            e = b + len(reason)
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-rea'
+                tag_list[e - 1] = 'E-rea'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-rea'
+            b = sentence.find(reason, e)
+    return tag_list, word_list
+
+def create_instances_from_document_remove_punctuation(docid, document_text, product_list, reasons_list):
+    product_list = set([re.sub('[^\w]', '', it) for it in product_list if len(re.sub('[^\w]', '', it))>1])  # strip symbols from product fields
+    reasons_list = set([re.sub('[^\w]', '', it) for it in reasons_list if len(re.sub('[^\w]', '', it))>1])
+    document_text = re.sub('[^\w]', '', document_text)
+
+    product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
+    reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
+    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?(原因|理由)', document_text)
+    if reasons_list == [] and kw_re:
+        document_text = re.sub('(流标|废标|终止|中止|失败|异常)的?(原因|理由).{,30}', '', document_text)
+
+    pos = []
+    neg = []
+    if len(document_text)<= max_len:
+        document_text = document_text[:max_len]
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if len(reasons_list)>0 and 'B-rea' not in tag_list:
+            print("少于%d字的文章废标原因标注未找到:%s"%(max_len, docid))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'E-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    elif len(reasons_list)>0:
+        b = document_text.find(reasons_list[0])
+        if b != -1:
+            document_text = document_text[max(0, b-8):][:max_len]
+        else:
+            document_text = document_text[:max_len]
+            print("多于%d字的文章废标原因标注未找到:%s," % (max_len, docid))
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if 'E-rea' not in tag_list:
+            print("文章废标原因标注未找到:%s, 开始位置:%d"%(docid, b))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'B-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    else:
+        epoch = len(document_text)//max_len
+        if len(document_text)%max_len > 50:
+            epoch += 1
+        for i in range(epoch):
+            sentence = document_text[i*max_len: (i+1)*max_len]
+            if len(sentence)<5:
+                # print("句子长度小于5")
+                # print(sentence)
+                continue
+            sentence = sentence[:max_len]
+            tag_list, word_list = fix_label_ner_remove_punctuation(sentence, product_list, reasons_list)
+            instance = TrainingInstance(word_list, tag_list)
+            if 'B-pro' in tag_list or 'B-rea' in tag_list:
+                pos.append(instance)
+            else:
+                neg.append(instance)
+    random.shuffle(neg)
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
+    instances = pos+neg
+    random.shuffle(instances)
+    return instances
+
 def create_instances_from_document(docid, document_text, product_list, reasons_list):
     product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
     reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
     kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
+
     if reasons_list == [] and kw_re:
         kw = kw_re.group(0)
         idx = document_text.find(kw)
@@ -196,10 +305,13 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
         else:
             neg.append(instance)
     else:
-        for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
-            document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        # for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
+        #     document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        for it in re.finditer('[^\w\d][一二三四五六七八九十]{1,3}、', document_text):
+            t = it.group(0)
+            document_text = document_text.replace(t, '。' + t[1:])
         for sentence in document_text.split('。'):
-            if len(sentence)<2:
+            if len(sentence)<5:
                 # print("句子长度小于5")
                 # print(sentence)
                 continue
@@ -249,7 +361,8 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
                 else:
                     neg.append(instance)
     random.shuffle(neg)
-    neg = neg[:min(5, 10*len(pos))]
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
     instances = pos+neg
     random.shuffle(instances)
     return instances
@@ -259,37 +372,92 @@ def create_training_instances(df):
     # df = pd.read_excel(xlsx)
     df.fillna('', inplace=True)
     for i in df.index:
-        try:
-            docid = df.loc[i, 'docid']
-            document_text = df.loc[i, 'text']
-            product_list = json.loads(df.loc[i, 'lbset'])
-            reasons_list = json.loads(df.loc[i, 'reasons_list'])
-            # if reasons_list == []:
-            #     continue
-            instances.extend(
-                create_instances_from_document(
-                    docid, document_text, product_list, reasons_list
-                ))
-        except Exception as e:
-            print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
+        if i % 5000==0:
+            print('create_instance', i)
+        # try:
+        docid = df.loc[i, 'docid']
+        document_text = df.loc[i, 'text']
+        product_list = json.loads(df.loc[i, 'lbset'])
+        reasons_list = json.loads(df.loc[i, 'reasons_list'])
+
+        notin_num = 0
+        for i in range(len(product_list)):  # if a product is missing from the text, retry allowing punctuation between its characters
+            p = product_list[i]
+            if re.search('[^\w]', p) == None and re.search(p, document_text) == None:
+                ser = re.search('[^\w]{,2}'.join(p), document_text)
+                if ser:
+                    product_list[i] = ser.group(0)
+                elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                    product_list[i] = p.replace('项目', '采购项目')
+                elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                    product_list[i] = p.replace('项目', '')
+                elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                    product_list[i] = p.lower()
+                elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                    product_list[i] = p.upper()
+                    document_text = document_text.upper()
+                else:
+                    title = get_title(docid)
+                    if title not in document_text:
+                        document_text = title + "。" + document_text
+                        ser = re.search('[^\w]{,2}'.join(p), document_text)
+                        if ser:
+                            product_list[i] = ser.group(0)
+                        elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                            product_list[i] = p.replace('项目', '采购项目')
+                        elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                            product_list[i] = p.replace('项目', '')
+                        elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                            product_list[i] = p.lower()
+                        elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                            product_list[i] = p.upper()
+                            document_text = document_text.upper()
+                        else:
+                            # print('docid:%s,not in text product: %s' % (docid, p))
+                            notin_num += 1
+                            if re.search('业绩', document_text) == None:
+                                product_notin.append((docid, p))
+                    else:
+                        # print('docid:%s,not in text product: %s'%(docid, p))
+                        notin_num +=1
+                        if re.search('业绩', document_text) == None:
+                            product_notin.append((docid, p))
+        if notin_num > len(product_list)/2:
+            print('找到的产品少于一半,过滤掉', docid, product_list)
+            continue
+
+        # if reasons_list == []:
+        #     continue
+        instances.extend(
+            create_instances_from_document(
+                docid, document_text, product_list, reasons_list
+            ))
+        # instances.extend(
+        #     create_instances_from_document_remove_punctuation(
+        #         docid, document_text, product_list, reasons_list
+        #     ))
+        # except Exception as e:
+        #     print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
     return instances
 
-def write_instance_to_example_files(instances, word2index, tag2index, output_dir):
+def write_instance_to_example_files(instances, word2index, tag2index, output_dir, tfrecode_name):
     # writers = []
     # instances = sorted(instances, key=lambda x: len(x.word_list))
     i = 0
     # for max_len in [200, 500, 1000]:
-    writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    # writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    writer = tf.python_io.TFRecordWriter(output_dir + '/%s'%tfrecode_name)
     # print('排序前:', [len(x.word_list) for x in instances[:5]])
     # instances.sort(key=lambda x:len(x.word_list), reverse=True)
     # print('排序后:', [len(x.word_list) for x in instances[:5]])
     while i < len(instances):
+        if i % 5000 == 0:
+            print('开始写入', i)
         instance = instances[i]
         if len(instance.word_list)>max_len:
             writer.close()
             break
         i += 1
-        # word_ids = [word2index.get(word, max_id) for word in instance.word_list]
         word_ids = [word2index.get(word, word2index.get('<unk>')) for word in instance.word_list]
         tag_ids = [tag2index.get(tag, 0) for tag in instance.tag_list]
         while len(word_ids)<max_len:
@@ -303,42 +471,117 @@ def write_instance_to_example_files(instances, word2index, tag2index, output_dir
         writer.write(tf_example.SerializeToString())
     writer.close()
 
+def 去除标注不在公告里面的公告(df):
+    df['notin'] = df.apply(
+        lambda x: json.dumps([it for it in json.loads(x['lbset']) if re.sub('[^\w]', '', it) not in re.sub('[^\w]', '', x['text'])],
+                             ensure_ascii=False), axis=1)
+    df = df[df['notin']=='[]']
+    return df
+
+
 if __name__ == "__main__":
-    df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
-    df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
-    df = df[df.loc[:, 'pos']==0]  # filter out articles without annotated failure reasons
-    df.reset_index(drop=True, inplace=True)
-    print('总文章数:',len(df))
-    df.fillna('', inplace=True)
-    print('读取完毕')
-    df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
-    lbset = [it for l in df['lbs'] for it in l]
-    c = collections.Counter(lbset)
-    m = c.most_common()
-    m3 = [it[0] for it in m if it[1] > 2]
-    df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
-    df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
-    print('len_df_dev:', len(df_dev))
-    df_reason = df[df.loc[:, 'reasons_list'] != '[]'].sample(frac=0.1, random_state=8)
-    print('len(df_reason)', len(df_reason))
-    df_dev.append(df_reason)
-    df_dev.drop_duplicates(subset=['docid'], inplace=True)
-    print('len_df_dev:', len(df_dev))
-    df_train = df[~df.index.isin(df_dev.index)]
-    print(len(df), len(df_dev), len(df_train))
-    df_train = df_train.sample(frac=1)
-    df_dev = df_dev.sample(frac=1)
-
-    # file = 'data/traindata.xlsx'
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+
+    # df = pd.read_excel('E:/产品及失败原因标注数据/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+    # # tfrecode_name = '20211125_ProductAndReason.tfrecode'
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    #
+    # df1 = pd.read_excel('E:/产品及失败原因标注数据/桥客宝产品数据1.xlsx')
+    #
+    # # tfrecode_name = 'qiaokebao1_product.tfrecode'
+    # df2 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据2.csv')
+    #
+    # # tfrecode_name = 'qiaokebao2_product.tfrecode'
+    # df3 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据3.csv')
+    # df = df.append([df1, df2, df3], ignore_index=True)
+    #
+    # tfrecode_name = 'all_product.tfrecode'
+    #
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    # df.fillna('', inplace=True)
+    # df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
+    # df = df[df.loc[:, 'pos']==0]  # filter out articles without annotated failure reasons
+    # df.reset_index(drop=True, inplace=True)
+    # print('总文章数:',len(df))
+    # df.fillna('', inplace=True)
+    # print('读取完毕')
+    # df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
+    # lbset = [it for l in df['lbs'] for it in l]
+    # c = collections.Counter(lbset)
+    # m = c.most_common()
+    # m3 = [it[0] for it in m if it[1] > 2]
+    # print('m3: ', m3[:20])
+    # df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
+    # print('sum(pos): ', sum(df['pos']))
+    # df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_reason = df[df.loc[:, 'reasons_list'] != '[]']
+    # if len(df_reason)>10:
+    #     df_reason = df_reason.sample(frac=0.1, random_state=8)
+    #     print('len(df_reason)', len(df_reason))
+    #     df_dev.append(df_reason)
+    # df_dev.drop_duplicates(subset=['docid'], inplace=True)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_train = df[~df.index.isin(df_dev.index)]
+    # print(len(df), len(df_dev), len(df_train))
+    # df_train = df_train.sample(frac=1)
+    # df_dev = df_dev.sample(frac=1)
+
+
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕',len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[:sp]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train1.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
     instances = create_training_instances(df_train)
+    del df_train
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/train_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成1')
+    with open('E:\产品及失败原因标注数据/product_notin1.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
 
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕', len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[sp:]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train2.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'  # strips symbols from text and products
+    instances = create_training_instances(df_train)
+    del df_train
+    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
+    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+    output_dir = 'data/train_data'
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成2')
+    with open('E:\产品及失败原因标注数据/product_notin2.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    df_dev = pd.read_csv('E:/产品及失败原因标注数据/df_dev.csv')
+
+    print('去除前', len(df_dev))
+    # df_dev = 去除标注不在公告里面的公告(df_dev)
+    # print('去除后', len(df_dev))
+    #
+    tfrecode_name = 'ProductAndReason_2023-02-24_dev.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
     instances = create_training_instances(df_dev)
+    del df_dev
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/test_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
-    print('全部写入成功!')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('全部写入成功!')
+    with open('E:\产品及失败原因标注数据/product_notin3.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+
+cursor.close()
+conn.close()
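
The product-matching fallback above is applied twice verbatim; a distilled sketch of the order it tries (hypothetical helper name, not part of the commit):

import re

def find_product_variant(p, document_text):
    """Return the form of product p that occurs in document_text, or None.
    Assumes p contains only word characters (the caller checks this first)."""
    ser = re.search('[^\w]{,2}'.join(p), document_text)  # allow up to 2 symbols between characters
    if ser:
        return ser.group(0)
    if '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
        return p.replace('项目', '采购项目')
    if '项目' in p and re.search(p.replace('项目', ''), document_text):
        return p.replace('项目', '')
    if re.search('[a-zA-Z]', p):
        if re.search(p.lower(), document_text):
            return p.lower()
        if re.search(p.upper(), document_text.upper()):
            return p.upper()  # the caller also upper-cases document_text in this branch
    return None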

+ 11 - 23
BiddingKG/dl/product/data_util.py

@@ -11,28 +11,16 @@ import numpy as np
 import pandas as pd
 from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word,viterbi_decode, load
 
-tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
-id_to_tag = {v:k for k,v in tag2index.items()}
-# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+tag2id = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+id_to_tag = {v:k for k,v in tag2id.items()}
 
-word_model = getModel_word()
-vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
-word2id = {k: v for v, k in enumerate(vocab)}
+path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
+path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
+vocab = load(path1)
+matrix = load(path2)
 max_id = len(vocab)
+word2id = {k: v for v, k in enumerate(vocab)}
 
-# path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
-# path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
-# vocab = load(path1)
-# matrix = load(path2)
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}
-
-# vocab = ["<pad>"] + word_model.index2word+ ["<unk>"]
-# matrix = np.zeros((len(vocab), 60))
-# for i in range(1, len(vocab)-1):
-#     matrix[i] = word_model[vocab[i]]
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}
 
 def df2data(df):
     import pandas as pd
@@ -211,8 +199,8 @@ def process_data(sentences):
     :return: 数字化后的统一长度
     '''
     maxLen = max([len(sentence) for sentence in sentences])
-    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
-    # tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
+    # tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
     pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
     return pad_tags
 
@@ -225,8 +213,8 @@ def get_ner(BIE_tag):
 def decode(logits, lengths, matrix):
     paths = []
     small = -1000.0
-    start = np.asarray([[small]*4+[0]])
-    # start = np.asarray([[small]*7+[0]])
+    # start = np.asarray([[small]*4+[0]]) # product only
+    start = np.asarray([[small]*7+[0]]) # product and failure reason
     for score, length in zip(logits, lengths):
         score = score[:length]
         pad = small * np.ones([length, 1])

+ 2 - 1
BiddingKG/dl/product/main.py

@@ -116,7 +116,8 @@ def save_model_pb():
     #
     # 把cpkt转为pb
 
-    input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    # input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    input_checkpoint = "model/ner_epoch22_f10.7923_loss1.1039.ckpt" #2023/4/6
     saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
     graph = tf.get_default_graph()  # 获得默认的图
     input_graph_def = graph.as_graph_def()  # 返回一个序列号

+ 79 - 0
BiddingKG/dl/product/predict.py

@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2023/3/27 10:19
+"""
+from BiddingKG.dl.product.product_model import Product_Model
+import os
+import re
+import time
+import pandas as pd
+import tensorflow as tf
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+def predict():
+    ckpt_path = "model"
+    import json
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch16_f10.8000_loss1.0775.ckpt')
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch7_f10.7998_loss1.0508.ckpt')
+        model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch22_f10.7923_loss1.1039.ckpt') # retrained after data cleanup
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch18_f10.8000_loss1.1276.ckpt') # newer
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch5_f10.6855_loss1.3800.ckpt') # older
+        t1 = time.time()
+
+        print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # df = pd.read_csv(os.path.dirname(__file__) + '/data/df_test.csv') #../test/
+        df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+        print('公告数量:', len(df))
+        df.fillna('', inplace=True)
+        # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+        df.reset_index(drop=True, inplace=True)
+        rs = []
+        for i in df.index:
+            text = df.loc[i, 'text']
+            # result = model.evaluate_line(sess, text)
+            # print(result[0][1])
+            # rs.append(json.dumps(result[0][1], ensure_ascii=False))
+
+            tmp = []
+            for line in text.split('。'):
+                # line = re.sub('[^\w]', '', line)
+                # if len(line) < 5:
+                #     continue
+                result = model.evaluate_line(sess, line)
+                # print(result[0][1])
+                tmp.extend(result[0][1])
+            rs.append(json.dumps(tmp, ensure_ascii=False))
+        df['predict_new'] = pd.Series(rs)
+        df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx', index=False)
+        print('elapsed: ', time.time()-t1)
+        return df
+
+def 统计准确率(df):
+    import json
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+    df['pr'] = df['predict_new'].apply(lambda x:set([it[0] for it in json.loads(x)]))
+    df['lb'] = df['lbset'].apply(lambda x: set(json.loads(x)))
+    df['pos'] = df.apply(lambda x:1 if x['pr']==x['lb'] else 0, axis=1)
+    eq = lb = pr = 0
+    for i in df.index:
+        pred = df.loc[i, 'pr']
+        label = df.loc[i, 'lb']
+        lb += len(label)
+        pr += len(pred)
+        eq += len(pred&label)
+    acc = eq/(pr + 1e-10)  # epsilon guards against an empty prediction set, as in train.py
+    recall = eq/(lb + 1e-10)
+    f1 = acc*recall*2/(acc+recall+1e-10)
+    print('precision:%.4f, recall:%.4f, F1:%.4f'%(acc, recall, f1))  # precision:0.6489, recall:0.8402, F1:0.7323
+    # df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+
+if __name__ == "__main__":
+    df = predict()
+    统计准确率(df)
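
For reference, 统计准确率 (literally "compute accuracy stats") scores the exact set overlap between predicted and labelled product names per document; a toy illustration with hypothetical label/prediction pairs:

    rows = [
        (['电脑', '打印机'], ['电脑']),   # (label set, predicted set)
        (['课桌'], ['课桌', '椅子']),
    ]
    eq = sum(len(set(p) & set(l)) for l, p in rows)  # 2 matching items
    pr = sum(len(set(p)) for _, p in rows)           # 3 predicted
    lb = sum(len(set(l)) for l, _ in rows)           # 3 labelled
    acc, recall = eq / pr, eq / lb
    print('precision=%.2f recall=%.2f f1=%.2f' % (acc, recall, 2*acc*recall/(acc+recall)))
    # precision=0.67 recall=0.67 f1=0.67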

+ 63 - 27
BiddingKG/dl/product/train.py

@@ -9,6 +9,9 @@
 # @Time    : 2021/1/13 0013 10:12
 import os
 import re
+import time
+import logging
+logging.basicConfig(level=logging.DEBUG)
 print('about to import tf')
 import tensorflow as tf
 print('about to import np')
@@ -19,11 +22,14 @@ print('about to import max_len')
 from BiddingKG.dl.product.data_tfrecord import max_len
 # from BiddingKG.dl.common.Utils import viterbi_decode
 print('about to set up CUDA env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 # max_len = 500
+batch_size = 256
+MIN_AFTER_DEQUEUE = batch_size*500
 
-def read_tfRecord(sess, file_tfRecord):
-    queue = tf.train.string_input_producer([file_tfRecord])
+def read_tfRecord(sess, file_list):
+    # queue = tf.train.string_input_producer([file_tfRecord])
+    queue = tf.train.string_input_producer(file_list)
     reader = tf.TFRecordReader()
     filename_, serialized_example = reader.read(queue)
     features = tf.parse_single_example(
@@ -39,13 +45,13 @@ def read_tfRecord(sess, file_tfRecord):
     text_len = tf.cast(features['text_len'], tf.int64)
     return text_len, word_ids , tag_ids
 
-def get_batch_record(sess,filename, batch_size):
-    text_len, word_ids, tag_ids = read_tfRecord(sess, filename)
+def get_batch_record(sess,file_list, batch_size):
+    text_len, word_ids, tag_ids = read_tfRecord(sess, file_list)
     text_len, word_ids, tag_ids = tf.train.shuffle_batch([text_len, word_ids , tag_ids],
                                                      batch_size=batch_size,
-                                                     capacity=200+batch_size*3,
-                                                     min_after_dequeue=1,
-                                                     num_threads=5)
+                                                     capacity=MIN_AFTER_DEQUEUE+batch_size*3,
+                                                     min_after_dequeue=MIN_AFTER_DEQUEUE,
+                                                     num_threads=8)
     text_len = tf.squeeze(text_len, squeeze_dims=1)
     return text_len, word_ids , tag_ids
 
@@ -60,14 +66,36 @@ def total_sample(file_name):
         sample_num += 1
     return sample_num
 
-if __name__ == "__main__":
-    print('entering main')
-    filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
-    filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
-    assert os.path.exists(filename)
-    assert os.path.exists(filename_dev)
+def train():
+    logging.info('entering train')
+    # filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
+    # filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
+    # print('os.path.dirname(__file__): ', os.path.dirname(__file__))
+    # print('filename path :', filename)
+    # assert os.path.exists(filename)
+    # assert os.path.exists(filename_dev)
+
+    file_list = []
+    file_list_dev = []
+    train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train1.tfrecode'
+    train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train2.tfrecode'
+    dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-02-24_dev.tfrecode'
+
+    # train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
+    # train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'
+    # dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
+
+    # print('filename path :', train1, os.path.exists(train1))
+
+    file_list.append(train1)
+    file_list.append(train2)
+
+    file_list_dev.append(dev1)
+
+
     print('check that the tfrecord files exist')
-    batch_size = 100
+    print('filename path :', train1, os.path.exists(train1))
+    # batch_size = 512
     # id_to_tag = {0: 'O', 1: 'B', 2: 'I', 3: 'E'}
     tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
     id_to_tag = {v:k for k,v in tag2index.items()}
@@ -88,25 +116,28 @@ if __name__ == "__main__":
         init_op = tf.global_variables_initializer()
         sess.run(init_op)
         print('variables initialized')
-        text_len, word_ids, tag_ids = get_batch_record(sess, filename, batch_size=batch_size)
+        text_len, word_ids, tag_ids = get_batch_record(sess, file_list, batch_size=batch_size)
         print('get_batch_record')
-        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, filename_dev, batch_size=batch_size)
+        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, file_list_dev, batch_size=batch_size)
         print('get_batch_record_dev')
         coord = tf.train.Coordinator()
         threads = tf.train.start_queue_runners(coord=coord)
-        print('total_sample(filename)', total_sample(filename))
 
-        total_num = total_sample(filename)
+        total_num = sum([total_sample(filename) for filename in file_list])
+        logging.info('total_train_num: %d'%total_num)
         batch_num = total_num//batch_size
-        batch_num_dev = total_sample(filename_dev)//batch_size
+        batch_num_dev = sum([total_sample(filename_dev) for filename_dev in file_list_dev])//batch_size
         num = 0
         l = []
-        max_f1 = 0
 
+        max_f1 = 0.79
         # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch10_f10.6875_loss1.5230.ckpt'))
-        # print('model restored successfully')
+        # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch0_f10.7740_loss1.2526.ckpt'))
+        model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch16_f10.8000_loss1.0775.ckpt'))
+        print('model restored successfully')
 
-        for epoch in range(50):
+        for epoch in range(20,50):
+            t1 = time.time()
             for batch in range(batch_num):
                 text_len_, word_ids_, tag_ids_ = sess.run([text_len, word_ids, tag_ids])
                 # print(text_len_.shape, word_ids_.shape, tag_ids_.shape)
@@ -118,9 +149,10 @@ if __name__ == "__main__":
 
 
                 if batch % 100==0:
-                    print('loss_:', loss_, '\tglobel_step_:',globel_step_)
+                    logging.info('loss_:%.4f,\tglobel_step_: %d'%(loss_, globel_step_))
+                    print('elapsed:', time.time()-t1)
                 num += text_len_.shape[0]
-            print('trained samples:%d, total samples:%d'%(num, total_num))
+            # print('trained samples:%d, total samples:%d'%(num, total_num))
 
             results = []
             trans = model.trans.eval()
@@ -154,15 +186,19 @@ if __name__ == "__main__":
             recall = equal_num / (gold_num + 1e-10)
             f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
             val_loss = np.mean(loss)
-            print('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
+            logging.info('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
             if f1>max_f1:
                 max_f1 = f1
                 model.saver.save(sess, os.path.join(os.path.dirname(__file__)+'/model', "ner_epoch%d_f1%.4f_loss%.4f.ckpt"%(epoch,f1, val_loss)))
-                print('save model, max_f1:%.4f' %f1)
+                logging.info('save model, max_f1:%.4f' %f1)
 
         coord.request_stop()
         coord.join(threads)
 
+if __name__ == "__main__":
+    train()
+
+
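
For reference, the queue-runner input pipeline used here (string_input_producer + shuffle_batch) is deprecated in later TF 1.x releases; a rough tf.data equivalent of get_batch_record, assuming each record carries text_len, word_ids and tag_ids at max_len = 500 (the exact feature spec is not shown in this diff):

    import tensorflow as tf

    def make_dataset(file_list, batch_size, min_after_dequeue):
        feature_spec = {  # assumed shapes; mirror data_tfrecord.py in practice
            'text_len': tf.FixedLenFeature([1], tf.int64),
            'word_ids': tf.FixedLenFeature([500], tf.int64),
            'tag_ids': tf.FixedLenFeature([500], tf.int64),
        }
        def parse(record):
            f = tf.parse_single_example(record, feature_spec)
            return f['text_len'][0], f['word_ids'], f['tag_ids']
        return (tf.data.TFRecordDataset(file_list)
                .map(parse, num_parallel_calls=8)   # matches num_threads=8 above
                .shuffle(min_after_dequeue)         # matches min_after_dequeue
                .batch(batch_size)
                .repeat())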
 
 
 

+ 3 - 2
BiddingKG/dl/ratio/re_ratio.py

@@ -3,7 +3,7 @@ from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
 
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|折扣系数|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
                    '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
                    '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
 ratio = ratio.pattern
@@ -182,7 +182,7 @@ def getUnifyNum(money):
                         result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 # subMoneys[0]中无金额单位,不可再拆分
                 elif subMoneys[0] == "":
-                    result += 0
+                    result += getMultipleFactor(factorUnit)
                 elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
                     # print(subMoneys)
                     # subMoneys[0] = subMoneys[0][0]
@@ -223,6 +223,7 @@ def test_str():
 费率):12
 折扣率(%):99.2063
 投标报价:96.00%(折扣率
+折扣系数:86(%)
 '''
     # s = '下浮率 百分之十点零陆(10.00%'
     print(extract_ratio(s))
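
A quick sanity check that the amended pattern picks up the new 折扣系数 alternative (the regex below is cut down to the relevant branch; the full pattern is the one defined above):

    import re

    ratio_branch = '(折扣系数|折扣率)([((]?[%‰][))]?|)[为是:: ,]{0,3}[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?'
    print(re.search(ratio_branch, '折扣系数:86(%)').group())  # 折扣系数:86(%)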

BIN
BiddingKG/dl/table_head/best_tiny.hdf5


BIN
BiddingKG/dl/table_head/best_tiny_230628.hdf5


+ 10 - 6
BiddingKG/dl/time/re_servicetime.py

@@ -20,9 +20,10 @@ TEST_MODE = False
 
 before = '(?P<before>' \
          '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
-         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
          '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-         '|交货时间|工期|质保期' \
+         '|交货时间|工期' \
+         '|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)(年限|期限|周期|期:)' \
          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
          '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
          '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
@@ -61,7 +62,7 @@ before2 = '(?P<before2>' \
         # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
 before3 = '(?P<before3>' \
-          '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
+          ',?([\((](日历天|施工时间)[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
           ')'
 
 before4 = '(?P<before4>' \
@@ -98,7 +99,7 @@ number = '(?P<number>' \
          ')'
 
 after = '(?P<after>' \
-        '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
+        '[个,,(\(]*(日历|历天|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
         ')'
         # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
         # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
@@ -265,7 +266,7 @@ def filter_service_time(output_list, text_index_list):
         if not re.findall(reg_right_digit, output):
             delete_list.append([output, text_index_list[i]])
             continue
-        if not re.findall(reg_right_unit, output):
+        if not re.findall(reg_right_unit, output) and not re.match('^\d{1,3}$', output):
             delete_list.append([output, text_index_list[i]])
             continue
         # 包含不要的字
@@ -352,7 +353,10 @@ def extract_servicetime(text):
 def test_from_str():
     # s = """
     # """
-    s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "交货,1.交货时间:7天,2.交货地点:广东清远市清城区飞来峡镇人民政府高田应急安置点"
+    s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
+'''
     print(extract_servicetime(s))
     print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
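
The relaxed unit check above now keeps bare 1-3 digit outputs (e.g. a 工期 of '14' whose 日历天 unit was consumed by the surrounding pattern); a minimal sketch, with reg_right_unit as a stand-in since its real definition is not shown in this diff:

    import re

    reg_right_unit = '[年月日天周]|日历天|工作日'  # stand-in pattern
    for output in ['14日历天', '14', '14号楼']:
        keep = bool(re.findall(reg_right_unit, output)) or bool(re.match('^\d{1,3}$', output))
        print(output, keep)  # 14日历天 True / 14 True / 14号楼 False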
 

+ 1 - 0
BiddingKG/readme/start.md

@@ -9,6 +9,7 @@ cd /data/python
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #start the extraction service
 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #19022: start the element-extraction service

+ 2 - 2
BiddingKG/run_extract_server.py

@@ -81,7 +81,7 @@ def run_thread(data,list_result):
     web_source_no = data.get("web_source_no","")
     web_source_name = data.get("web_source_name","")
     original_docchannel = data.get("original_docchannel","")
-    print("web_source_name:",web_source_name)
+    # print("web_source_name:",web_source_name)
     is_fail = False
     try:
         if _content!="":
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
     # 以json形式返回结果
     #_resp = json.dumps(data_res,cls=MyEncoder)
     #log(str(data["flag"])+str(data))
-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
     list_result.append(data_res)
     if is_fail:
         list_result.append(is_fail)

Some files were not shown because too many files changed in this diff