
Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

luojiehua · 1 year ago · commit 251d20468e

+ 161 - 15
BiddingKG/dl/bidway/re_bidway.py

@@ -200,6 +200,7 @@ import re
 #
 #     return output_list[0], text_index_list[0]
 
+normal_bidway = "公开招标|邀请招标|竞争性谈判|竞争性磋商|单一来源|框架协议|询价"
 
 bidway = '单一来源' \
          '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
@@ -210,17 +211,17 @@ bidway = '单一来源' \
          '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
          '|库内邀请|库内公开发包|内部邀标' \
          '|定点采购议价|定点采购' \
-         '|竞争性评审'
+         '|竞争性评审|框架协议'
 
 not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
              '|限时竞价|咨询单位|询价单'
 
-not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除|可以选择|包括|涉及|非"
 
 not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
                     "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
                     "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
-                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
+                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败|小组"
 
 bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
                  '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
@@ -268,21 +269,64 @@ def re_standard_bidway(_str):
     bidway_list = []
     if match:
         for m in match:
-            m_dict = m.groupdict()
-            m_span = m.span()
-            keyword = ""
-            keyword_index = [m_span[0], m_span[1]]
-            for key in m_dict.keys():
-                if key == "value":
-                    keyword = m_dict.get(key)
-                else:
-                    keyword_index[0] += len(m_dict.get(key))
+            keyword = m.group('value')
+            keyword_index = list(m.span('value'))
+            behind_str = _str[m.start(): m.end()+30]
+            if len(re.findall(normal_bidway, behind_str))>1:
+                keyword = ''
+                for it in re.finditer('(?P<sign>.{1,2})(?P<bidway>'+normal_bidway+')+', behind_str): # handle multiple options listed after the bidding-method keyword
+                    if '□' != it.group('sign')[-1]:
+                        keyword = it.group('bidway')
+                        keyword_index = [m.start()+it.start('bidway'), m.start()+it.end('bidway')]
+                        break
+            # m_dict = m.groupdict()
+            # m_span = m.span()
+            # keyword = ""
+            # keyword_index = [m_span[0], m_span[1]]
+            # for key in m_dict.keys():
+            #     if key == "value":
+            #         keyword = m_dict.get(key)
+            #     else:
+            #         keyword_index[0] += len(m_dict.get(key))
             bidway_list.append([keyword, keyword_index])
 
     return bidway_list
 
+def re_normal_bidway(_str):
+    ser = re.search("("+normal_bidway+")(转为?|变更为|更改为)"+"(?P<bidway>(" + normal_bidway + "))", _str) # if the method was changed, take the post-change one
+    if ser:
+        return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    reg_all = "(?P<value>" + normal_bidway + ")"
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    bidway_set = set()
+    if match:
+        for m in match:
+            keyword = m.group()
+            if keyword == '公开招标' and m.start()>0 and _str[m.start()-1]=='非':
+                continue
+            keyword_index = list(m.span())
+            bidway_set.add(keyword)
+            bidway_list.append([keyword, keyword_index])
+    if len(bidway_list) == 0: # if no standard method is found, fall back to abbreviated forms
+        ser = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
+        if ser:
+            return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    if len(bidway_set) > 1: # return empty when multiple distinct methods match
+        return []
+    return bidway_list
 
 def re_all_bidway(_str):
+    reg_all = "(?P<value>" + normal_bidway + ")" # prefer the normalized bidding methods
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    if match:
+        for m in match:
+            keyword = m.group()
+            keyword_index = list(m.span())
+            bidway_list.append([keyword, keyword_index])
+    return bidway_list
+
     reg_all = "(?P<value>" + bidway + ")"
     match = re.finditer(reg_all, _str)
     bidway_list = []
@@ -339,6 +383,13 @@ def get_one_word(bidway_list):
 
 
 def re_bidway(text, title):
+    # first try the standard bidding methods in the title
+    if len(title)<100:
+        bidway_list = re_normal_bidway(title)
+        if bidway_list:
+            word, text_index = get_one_word(bidway_list)
+            return word, text_index
+
    # replace easily confused words
     text_clean = re_not_bidway(text)
     title_clean = re_not_bidway(title)
@@ -406,12 +457,30 @@ bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
                '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
                '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
                '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
-               '网上询价': '询价'}
+               '网上询价': '询价', '框架协议': '框架协议', '谈判':'竞争性谈判'}
# unify bidway names to a standard set
 def bidway_integrate(bidway):
     integrate_name = bidway_dict.get(bidway,"其他")
     return integrate_name
 
+def bidway_normalize(key):
+    if re.search('公开招标|公开发包', key):
+        return '公开招标'
+    elif re.search('单一来源', key):
+        return '单一来源'
+    elif re.search('磋商', key):
+        return '竞争性磋商'
+    elif re.search('谈判', key):
+        return '竞争性谈判'
+    elif re.search('竞谈|竞价|竞投|竞标', key):
+        return '竞价'
+    elif re.search('询价|询比|比价|询单', key):
+        return '询价'
+    elif re.search('邀请|邀标', key):
+        return '邀请招标'
+    else:
+        return bidway_dict.get(key, '其他')
+
 def test_csv():
     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
 
@@ -456,13 +525,90 @@ def test_str():
 
 
 def test_html():
-    html_path = "C:/Users/Administrator/Desktop/3.html"
+    # html_path = "C:/Users/Administrator/Desktop/3.html"
+    html_path = 'd:/html/2.html'
 
-    with open(html_path, "r") as f:
+    with open(html_path, "r", encoding='utf-8') as f:
         s = f.read()
 
     print(extract_bidway(s, title=""))
 
+def get_valuate():
+    import psycopg2
+    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')
+    cursor = conn.cursor()
+    sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;" # where docid='110635873'
+    # sql = "select c1.docid, c1.doctitle from corpus_otherinput c1;"
+    # sql = "select text from corpus_iedocument limit 50000;"
+    cursor.execute(sql)
+    datas = []
+    olds = []
+    news = []
+    label_old = []
+    label_new = []
+    labels = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        doctitle = row[1]
+        ex = row[2]
+        text = row[3]
+        ser = re.search('"bidway": "(\w{,6})"', ex)
+        # print('ser:', ser)
+        old = ser.group(1) if ser else ""
+        pred = extract_bidway(text, title=doctitle)
+
+        # list_bidway = extract_bidway(text, title=doctitle)
+        # print('list_bidway', list_bidway)
+        # if list_bidway:
+        #     bidway = list_bidway[0].get("body")
+        #     # bidway名称统一规范
+        #     bidway = bidway_integrate(bidway)
+        # else:
+        #     bidway = ""
+        # print('bidway: ', bidway)
+
+        pred = pred[0]['body'] if len(pred) > 0 else ""
+        new = bidway_dict.get(pred, "其他") if pred!="" else ""
+        sql2 = "select value from brat_bratannotation where document_id='{0}' and value like '%bidway%' limit 4;".format(docid)
+        cursor.execute(sql2)
+        lb_new = docid + "_"
+        lb_old = docid + "_"
+        tmp_l = []
+        for row in cursor.fetchall():
+            lb = row[0].split()[-1]
+            lb = bidway_dict.get(lb, "其他")  # new: accuracy 0.9642, recall 0.9642, F1 0.8965
+            # lb = bidway_normalize(lb)   # old: accuracy 0.9287, recall 0.9287, F1 0.8011; new: accuracy 0.9692, recall 0.9692, F1 0.9105
+
+            tmp_l.append(lb)
+            if lb == new:
+                lb_new = docid + "_" + lb
+            if lb == old:
+                lb_old = docid + "_" + lb
+        olds.append(docid + "_" + old)
+        news.append(docid + "_" + new)
+        label_new.append(lb_new)
+        label_old.append(lb_old)
+        labels.append(';'.join(tmp_l))
+        datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, ';'.join(tmp_l)))
+
+    eq_old = len(set(olds)&set(label_old))
+    eq_new = len(set(news)&set(label_new))
+
+    acc_old = eq_old/len(set(olds))
+    recall_old = eq_old/len(set(label_old))
+    f1_old = 2*acc_old*recall_old/(acc_old+recall_old)
+
+    acc_new = eq_new/len(set(news))
+    recall_new = eq_new/len(set(label_new))
+    f1_new = 2*acc_new*recall_new/(acc_new+recall_new)
+    print('旧准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
+    print('新准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))
+
+
+    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
+    df['old_pos'] = df.apply(lambda x:1 if x['pred_old']==x['label_old'] else 0, axis=1)
+    df['new_pos'] = df.apply(lambda x:1 if x['pred_new']==x['label_new'] else 0, axis=1)
+    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)
 
 if __name__ == "__main__":
     # extract_bidway(s)
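
A quick, hedged sanity check of the new helpers (hypothetical inputs; the expected values follow from the regexes above):

# re_normal_bidway prefers the post-change method and treats multiple distinct methods as ambiguous:
print(re_normal_bidway("本项目由公开招标变更为竞争性磋商"))  # expected: [['竞争性磋商', [11, 16]]]
print(re_normal_bidway("公开招标或邀请招标"))  # expected: [] (two distinct methods)
# bidway_normalize maps free-form phrases onto the standard categories:
print(bidway_normalize("网上询价"))  # expected: 询价
print(bidway_normalize("单一来源采购"))  # expected: 单一来源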

+ 70 - 17
BiddingKG/dl/common/Utils.py

@@ -305,6 +305,8 @@ def changeIndexFromWordToWords(tokens,word_index):
         if before_index<=word_index and after_index>word_index:
             return i
         before_index = after_index
+    return i+1
+
         
 def getIndexOfWords(words):
     global vocab_words,file_vocab_words
@@ -604,7 +606,18 @@ def fitDataByRule(data):
     result = re.sub("[。]","",result)
     return  result
 
-time_format_pattern = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2}))")
+from datetime import date
+# validate that year/month/day form a real calendar date
+def isValidDate(year, month, day):
+    try:
+        date(year, month, day)
+    except:
+        return False
+    else:
+        return True
+
+time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def timeFormat(_time):
     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(time_format_pattern,_time)
@@ -622,22 +635,48 @@ def timeFormat(_time):
                 if k=="day":
                     day = v
             if year!="":
-                if len(year)==2:
-                    year = "20"+year
-                if int(year)>int(current_year):
-                    legal = False
+                if re.search("^\d+$",year):
+                    if len(year)==2:
+                        year = "20"+year
+                    if int(year)>int(current_year):
+                        legal = False
+                else:
+                    _year = ""
+                    for word in year:
+                        if word == '0':
+                            _year += word
+                        else:
+                            _year += str(getDigitsDic(word))
+                    year = _year
             else:
                 legal = False
             if month!="":
-                if int(month)>12:
-                    legal = False
+                if re.search("^\d+$", month):
+                    if int(month)>12:
+                        legal = False
+                else:
+                    month = int(getUnifyNum(month))
+                    if month>=1 and month<=12:
+                        month = str(month)
+                    else:
+                        legal = False
             else:
                 legal = False
             if day!="":
-                if int(day)>31:
-                    legal = False
+                if re.search("^\d+$", day):
+                    if int(day)>31:
+                        legal = False
+                else:
+                    day = int(getUnifyNum(day))
+                    if day >= 1 and day <= 31:
+                        day = str(day)
+                    else:
+                        legal = False
             else:
                 legal = False
+            # print(year,month,day)
+            if not isValidDate(int(year),int(month),int(day)):
+                legal = False
             if legal:
                 return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
     return ""
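
A minimal sketch of what the widened date pattern now accepts (hypothetical inputs; assumes getUnifyNum and getDigitsDic convert Chinese numerals as they do elsewhere in Utils):

print(timeFormat("开标时间:二〇二三年七月十八日"))  # expected: 2023-07-18
print(timeFormat("2023/2/29"))  # expected: "" (isValidDate rejects Feb 29, 2023)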
@@ -797,23 +836,28 @@ def uniform_package_name(package_name):
     '''
     package_name_raw = package_name
     package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
+    package_name = package_name.replace('标段(包)', '标段').replace('№', '')
+    package_name = re.sub('\[|【', '', package_name)
     kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
     name = ""
     if kw:
         name += kw.group(0)
    if re.search('^[a-zA-Z0-9-]{5,}$', package_name):   # identifiers of five or more characters
         _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
+        # print('规范化包号1', _digit)
         name += _digit
    elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # handle patterns like A包2标段
         ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
+        # print('规范化包号2', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
         _digit = ser.groupdict().get('num')
         _digit = uniform_num(_digit)
         name += _char.upper() + _digit
-    elif re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name): # handle patterns like A包2标段
-        ser = re.search('第?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|([分子]?[包标]))', package_name)
+    elif re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # handle patterns like A包2标段
+        ser = re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name)
+        # print('规范化包号3', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
@@ -822,8 +866,9 @@ def uniform_package_name(package_name):
         if _char:
             name += _char.upper()
         name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # unify numerals to Arabic digits
-        ser = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+    elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name):  # unify numerals to Arabic digits
+        ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
+        # print('规范化包号4', ser.group(0))
         _char = ser.groupdict().get('eng')
         if _char:
             _char = _char.upper()
@@ -832,22 +877,28 @@ def uniform_package_name(package_name):
         if _char:
             name += _char.upper()
         name += _digit
-    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name):  # unify numerals to Arabic digits
-        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z]{1,4})', package_name).group('eng').upper()
+    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name):  # unify numerals to Arabic digits
+        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name).group('eng').upper()
+        # print('规范化包号5', _digit)
         name += _digit
    elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name):  # unify numerals to Arabic digits
         _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
+        # print('规范化包号6', _digit)
         name += _digit
    elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name):  # unify numerals to Arabic digits
         _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
+        # print('规范化包号7', _digit)
         _digit = uniform_num(_digit)
         name += _digit
     elif re.search('^[a-zA-Z0-9-]+$', package_name):
         _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
+        # print('规范化包号8', _char)
         name += _char.upper()
     if name == "":
         return package_name_raw
     else:
+        if name.isdigit():
+            name = str(int(name))
         # print('原始包号:%s, 处理后:%s'%(package_name, name))
         return name
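
A hedged sketch of the intended normalization outcomes (hypothetical inputs; assumes uniform_num maps numerals to Arabic digits):

print(uniform_package_name("A包2标段"))  # expected: A2
print(uniform_package_name("第2标段"))  # expected: 2
print(uniform_package_name("02包"))  # expected: 2 (purely numeric names now drop leading zeros via int())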
 
@@ -860,11 +911,13 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)
-        if '万元' in header and '万' not in money_text:
+        if re.search('万元|[((]万[))]', header) and '万' not in money_text:  # fix docid 37797825: headers like 控制价(万)
             money_text += '万元'
+        # money = float(getUnifyMoney(money_text))
         money = float(getUnifyMoney(money_text))
        if money > 10000000000000:  # drop implausible amounts above 1e13
             money = 0
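
A small, hedged check of the header fix (hypothetical values; assumes getUnifyMoney parses comma-grouped numbers and expands 万元, and that the function returns (money, money_unit) as its locals suggest):

money, unit = money_process("123,456.78", "控制价(万)")
print(money)  # expected: 1234567800.0 -- the (万) header now appends the 万元 suffix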

+ 3 - 3
BiddingKG/dl/foolnltk/selffool/lexical.py

@@ -100,7 +100,7 @@ class LexicalAnalyzer(object):
                 lb = label.split("_")[0]
 
                 if lb == "S":
-                    ens.append((i, i + 1, lt, word))
+                    ens.append((i-1, i, lt, word))
                 elif lb == "B":
                     entity = ""
                     entity += word
@@ -109,11 +109,11 @@ class LexicalAnalyzer(object):
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity), i, lt, entity))
                     entity = ""
 
             if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                ens.append((i - len(entity), i, lt, entity))
             all_entitys.append(ens)
 
         return all_entitys

File diff suppressed because it is too large
+ 0 - 2
BiddingKG/dl/interface/Preprocessing.py


BIN
BiddingKG/dl/interface/agency_set.pkl


+ 77 - 3
BiddingKG/dl/interface/extract.py

@@ -13,6 +13,7 @@ import os
 import codecs
 import requests
 import time
+from unicodedata import normalize
 
 _time1 = time.time()
 sys.path.append(os.path.abspath("../.."))
@@ -114,11 +115,74 @@ def extractCount(extract_dict):
         extract_count += 1
     return extract_count
 
+# normalize character encoding (NFKD), preserving CJK punctuation
+def str_normalize(text):
+    # time1 = time.time()
+    cn_punctuation = "¥,。:;{}!?()<"
+    text_split = re.split("([{}]+)".format(cn_punctuation),text)  # capture whole punctuation runs so no character is dropped
+    # print(text_split)
+    new_text = ""
+    for s in text_split:
+        if re.search("^[{}]+$".format(cn_punctuation),s):
+            new_text += s
+        else:
+            new_text += normalize('NFKD', s)
+    # print("str_normalize cost time %s"%str(time.time()-time1))
+    # print(new_text)
+
+    return new_text
+# repair prem entities whose region prefix is incomplete
+def repair_entity(prem,district_dict,list_articles):
+    district_dict = district_dict['district']
+    province = district_dict['province'] if district_dict['province'] and district_dict['province'] not in ['未知','全国'] else ""
+    city = district_dict['city'] if district_dict['city'] and district_dict['city']!='未知' else ""
+    district = district_dict['district'] if district_dict['district'] and district_dict['district']!='未知' else ""
+    content_text = list_articles[0].content
+
+    autonomous_region_dict = {
+        "新疆":"新疆维吾尔",
+        "西藏":"西藏",
+        "内蒙古":"内蒙古",
+        "广西":"广西壮族",
+        "宁夏":"宁夏回族"
+    }
+
+    for package,_prem in prem[0]['prem'].items():
+        for role in _prem['roleList']:
+            if role['role_name'] in ['tenderee','agency']:
+                role_text = role['role_text']
+                if re.search("^[省市县区]",role_text):
+                    if role_text[0]=='省' and role_text[:2] not in ['省道']:
+                        role['role_text'] = province + role_text
+                    elif role_text[0]=='市' and role_text[:2] not in ['市政','市场']:
+                        if district+'市' in content_text:
+                            # county-level city
+                            role['role_text'] = district + role_text
+                        else:
+                            role['role_text'] = city + role_text
+                    elif role_text[0] in ['县','区']:
+                        role['role_text'] = district + role_text
+                elif re.search("^自治[区州县]",role_text):
+                    if role_text[:3]=='自治区':
+                        role['role_text'] = autonomous_region_dict.get(province,"") + role_text
+                    elif role_text[:3] in ['自治县',"自治州"]:
+                        if re.search("自治[县州]?$",district):
+                            role['role_text'] = re.sub("自治[县州]?","",district) + role_text
+                        elif re.search("族$",district):
+                            role['role_text'] = district + role_text
+                        elif re.search("自治[县州]?$",city):
+                            role['role_text'] = re.sub("自治[县州]?","",city) + role_text
+                        elif re.search("族$",city):
+                            role['role_text'] = city + role_text
+
+
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',**kwargs):
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
+    # normalize character encoding
+    text = str_normalize(text)
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
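
Two hedged illustrations of the new preprocessing steps (hypothetical inputs and shapes, mirroring what predict() passes in):

# str_normalize folds fullwidth alphanumerics to ASCII but keeps the listed CJK punctuation:
print(str_normalize("预算:1000(元)"))  # expected: 预算:1000(元)

# repair_entity prepends the extracted district to truncated role names:
from types import SimpleNamespace
district = {'district': {'province': '广东', 'city': '深圳', 'district': '南山'}}
prem = [{'prem': {'Project': {'roleList': [{'role_name': 'tenderee', 'role_text': '区人民医院'}]}}}]
repair_entity(prem, district, [SimpleNamespace(content='南山区人民医院采购公告')])
print(prem[0]['prem']['Project']['roleList'][0]['role_text'])  # expected: 南山区人民医院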
@@ -209,11 +273,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
    '''table element extraction'''
     table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
+    # print('表格提取中标人:', table_prem)
+    # print('原提取角色:', prem[0]['prem'])
     if table_prem:
         getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
    '''candidate extraction'''
     candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    # print('表格提取候选人:', candidate_top3_prem)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
    '''extract consortium information'''
@@ -259,14 +326,21 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
     cost_time["district"] = round(time.time() - start_time, 2)
 
-    '''cap the maximum amount by industry'''
-    getAttributes.limit_maximum_amount(prem, industry)
+    '''repair entities based on the district extraction result'''
+    repair_entity(prem,district,list_articles)
+
+    # '''cap the maximum amount by industry'''
+    # getAttributes.limit_maximum_amount(prem, industry) # disabled 20230703; correction now runs after all elements are merged
 
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-01-06'}
+    version_date = {'version_date': '2023-07-18'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
+
+    '''final check to correct the tender and award amounts'''
+    getAttributes.limit_maximum_amount(data_res, list_entitys[0])
+
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

File diff suppressed because it is too large
+ 346 - 308
BiddingKG/dl/interface/getAttributes.py


BIN
BiddingKG/dl/interface/header_set.pkl


+ 5 - 5
BiddingKG/dl/interface/modelFactory.py

@@ -45,7 +45,7 @@ class Model_role_classify_word():
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
-        self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
+        # self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
@@ -64,9 +64,9 @@ class Model_role_classify_word():
               
               input0 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
               input1 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
-              input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+              # input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
               output = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-              self.model_role = [[input0,input1,input2],output]
+              self.model_role = [[input0,input1],output]  #,input2
         return self.model_role
     '''
     def load_weights(self):
@@ -75,9 +75,9 @@ class Model_role_classify_word():
     '''
     
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
-        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
+        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=20,center_include=False,word_flag=True,text=entity_text) #size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
     

File diff suppressed because it is too large
+ 453 - 150
BiddingKG/dl/interface/predictor.py


BIN
BiddingKG/dl/interface/product_savedmodel/productAndfailreason.pb


BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


+ 15 - 10
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -10,8 +10,8 @@ def re_rule():
     data = pd.read_csv("C:\\Users\\admin\\Desktop\\alldata_error.csv", index_col=0)
 
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)" 
-              "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
-              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+              "(?P<moneySource>([^,,。;;已]{,20}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|" 
+              "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,20}(资本[金]|资金|自筹|贷款|补助|拨款|"
                "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
     num = 0
     moneySourceList = []
@@ -76,14 +76,14 @@ def re_rule():
 
 def extract_moneySource(text):
     rule = re.compile("(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(于|是|为|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                      "(?P<moneySource>([^,,。;;已]*(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
-                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]*(资本[金]|资金|自筹|贷款|补助|拨款|"
+                      "(?P<moneySource>([^,,。;;已]{,30}(资本金?|资金|自筹|贷款|补助|拨款|财政|其[他它]|自行支付|成本|筹[集措]|"
+                      "大修|拨付|国有|集体|工程款|自有|投资|国资|外资|解决)([::.、\d]+%)?[,,;;]?)?([^,,.。;;已]{,30}(资本[金]|资金|自筹|贷款|补助|拨款|"
                       "财政|其[他它]|自行支付|成本|筹[集措]|大修|拨付|国有|集体|工程款|自有|投资|国资|外资)[::.、\d]*%[,,;;]?)*)")
 
     re1 = re.compile("(资[金佥]来[源自][^已]|建设资[金佥][^已]|项目资[金佥][^已,,。.;;]|资[金佥]性质)")
     re2 = re.compile(r"(?P<start>(资[金佥]来[源自]|建设资[金佥]|项目资[金佥]|资[金佥]性质)(是|为|于|来,?源[为于]?|来自于?)?(,|,|;+|:+)?)"
-                     r"(?P<moneySource>[^,,。;;已]{2,}?)[,。;,]")
-    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析")
+                     r"(?P<moneySource>[^,,。;;已]{4,}?)[,。;,]")
+    re_error = re.compile(r"核查|合法|紧张|null|分配|承诺|执行|已落实|已到位|批准|审计|调整|证明|监管|报告|完成|说明|使用|/|签订|规定|总价|未支付|主体|分析|是否")
 
     sub = re.compile("[::。]|^[.,,、\)]|[,,;;]$|及采购预算|[及和]?其?落实情况|预算金额及服务期限|[1-9]、|及?招标控制价|"
                      "及落实|及出资比例|及金额|及性质|项目出资比例|来源为|及来源|及最高限价|及构成|及项目投资估算额|及预算资金|已?落实|"
@@ -115,6 +115,7 @@ def extract_moneySource(text):
                 # print(groupdict1)
                 if source1:
                     groupdict1["index"] = word_index
+                    groupdict1["prob"] = 0.9
                     # print(groupdict1['index'])
                     results.append(groupdict1)
             word_index += len(item)
@@ -127,8 +128,9 @@ def extract_moneySource(text):
                     groupdict2 = res.groupdict()
                     source2 = groupdict2['moneySource']
                     # print("source2==>",source2)
-                    if source2 and not re_error.search(source2):
+                    if source2 and not re_error.search(res.group()):
                         groupdict2["index"] = copy_index
+                        groupdict2["prob"] = 0.8
                         results.append(groupdict2)
                 copy_index += len(item)
     first = []
@@ -148,7 +150,7 @@ def extract_moneySource(text):
     for result in first:
         entity_text = sub.sub("",result['moneySource'])
         # wordOffset_begin = result['index'] + re.search(entity_text,result['start']+result['moneySource']).start()
-        if entity_text is None:
+        if entity_text is None or len(entity_text)>40:
             continue
         else:
             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
@@ -158,6 +160,7 @@ def extract_moneySource(text):
             _moneySource['body'] = entity_text
             _moneySource['begin_index'] = wordOffset_begin
             _moneySource['end_index'] = wordOffset_end
+            _moneySource['prob'] = result['prob']
             # print(_moneySource)
             list_moneySource.append(_moneySource)
     return list_moneySource
@@ -166,7 +169,9 @@ def extract_moneySource(text):
 
 if __name__ == '__main__':
     # re_rule()
-    test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。"
+    test ="a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。" \
+          "1、采购内容及资金来源:采购内容为汉上实验学校采购24台3匹柜机空调。资金来源为财政资金。"
+    # test = ",资金来源是否都是要具体到每条来源明细,"
     # 11,23 35,37
-    extract_moneySource(test)
+    print(extract_moneySource(test))
     pass
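
With the added prob field, each returned dict now carries a rule confidence: 0.9 for the primary pattern, 0.8 for the fallback. A hedged sketch of one returned element (values illustrative):

# {'body': '财政资金', 'begin_index': ..., 'end_index': ..., 'prob': 0.9}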

+ 294 - 51
BiddingKG/dl/product/data_tfrecord.py

@@ -10,6 +10,21 @@ import os
 import re
 import collections
 from BiddingKG.dl.product.data_util import word2id, max_id
+import psycopg2
+import json
+import pickle
+
+conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='bid_validate')
+cursor = conn.cursor()
+def get_title(docid):
+    sql = "select doctitle from qiao_ke_bao_raw where docid='{0}'".format(docid)
+    cursor.execute(sql)
+    for row in cursor.fetchall():
+        return row[0]
+    return ''
+
+product_notin = []
+
 max_len = 500
 
 def create_int_feature(values):
@@ -61,8 +76,11 @@ def fix_label_ner_句号分开(sentence, product_list, reasons_list):
 
 
 def create_instances_from_document_句号分开(docid, document_text, product_list, reasons_list):
-    for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
-        document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    # for it in ['一','二','三','四','五','六','七','八','九','十','十一','十二','十三','十四','十五']:
+    #     document_text = document_text.replace(',%s、'%it, '。%s、'%it)
+    for it in re.finditer('[^\w\d。][一二三四五六七八九十]{1,3}、', document_text):
+        t = it.group(0)
+        document_text = document_text.replace(t, '。' + t[1:])
 
     if docid in ['docid']:
         pass
@@ -137,6 +155,10 @@ def fix_label_ner(sentence, product_list, reasons_list):
     tag_list = ['S'] * len(sentence)
     word_list = list(sentence)
     for product in product_list:
+        if len(re.sub('[^\w]', '', product))<1:
+            print('错误产品: ', product)
+            continue
+
         b = sentence.find(product)
         while b != -1:
             e = b + len(product)
@@ -158,10 +180,97 @@ def fix_label_ner(sentence, product_list, reasons_list):
             b = sentence.find(reason, e)
     return tag_list, word_list
 
+def fix_label_ner_remove_punctuation(sentence, product_list, reasons_list):
+    tag_list = ['S'] * len(sentence)
+    word_list = list(sentence)
+    if len(product_list)>0:
+        for it in re.finditer('|'.join(product_list), sentence):
+            b, e = it.span()
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-pro'
+                tag_list[e - 1] = 'E-pro'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-pro'
+
+    for reason in reasons_list:
+        b = sentence.find(reason)
+        while b != -1:
+            e = b + len(reason)
+            if tag_list[b] == 'S' and tag_list[e - 1] == 'S':
+                tag_list[b] = 'B-rea'
+                tag_list[e - 1] = 'E-rea'
+                for i in range(b + 1, e - 1):
+                    tag_list[i] = 'I-rea'
+            b = sentence.find(reason, e)
+    return tag_list, word_list
+
+def create_instances_from_document_remove_punctuation(docid, document_text, product_list, reasons_list):
+    product_list = set([re.sub('[^\w]', '', it) for it in product_list if len(re.sub('[^\w]', '', it))>1])  # strip symbols from product fields
+    reasons_list = set([re.sub('[^\w]', '', it) for it in reasons_list if len(re.sub('[^\w]', '', it))>1])
+    document_text = re.sub('[^\w]', '', document_text)
+
+    product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
+    reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
+    kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?(原因|理由)', document_text)
+    if reasons_list == [] and kw_re:
+        document_text = re.sub('(流标|废标|终止|中止|失败|异常)的?(原因|理由).{,30}', '', document_text)
+
+    pos = []
+    neg = []
+    if len(document_text)<= max_len:
+        document_text = document_text[:max_len]
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if len(reasons_list)>0 and 'B-rea' not in tag_list:
+            print("少于%d字的文章废标原因标注未找到:%s"%(max_len, docid))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'E-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    elif len(reasons_list)>0:
+        b = document_text.find(reasons_list[0])
+        if b != -1:
+            document_text = document_text[max(0, b-8):][:max_len]
+        else:
+            document_text = document_text[:max_len]
+            print("多于%d字的文章废标原因标注未找到:%s," % (max_len, docid))
+        tag_list, word_list = fix_label_ner_remove_punctuation(document_text, product_list, reasons_list)
+        if 'E-rea' not in tag_list:
+            print("文章废标原因标注未找到:%s, 开始位置:%d"%(docid, b))
+        instance = TrainingInstance(word_list, tag_list)
+        if 'B-pro' in tag_list or 'B-rea' in tag_list:
+            pos.append(instance)
+        else:
+            neg.append(instance)
+    else:
+        epoch = len(document_text)//max_len
+        if len(document_text)%max_len > 50:
+            epoch += 1
+        for i in range(epoch):
+            sentence = document_text[i*max_len: (i+1)*max_len]
+            if len(sentence)<5:
+                # print("句子长度小于5")
+                # print(sentence)
+                continue
+            sentence = sentence[:max_len]
+            tag_list, word_list = fix_label_ner_remove_punctuation(sentence, product_list, reasons_list)
+            instance = TrainingInstance(word_list, tag_list)
+            if 'B-pro' in tag_list or 'B-rea' in tag_list:
+                pos.append(instance)
+            else:
+                neg.append(instance)
+    random.shuffle(neg)
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
+    instances = pos+neg
+    random.shuffle(instances)
+    return instances
+
 def create_instances_from_document(docid, document_text, product_list, reasons_list):
     product_list = sorted(product_list, key=lambda x:len(x), reverse=True)
     reasons_list = sorted(reasons_list, key=lambda x:len(x), reverse=True)
     kw_re = re.search('(流标|废标|终止|中止|失败|异常)的?原因', document_text)
+
     if reasons_list == [] and kw_re:
         kw = kw_re.group(0)
         idx = document_text.find(kw)
@@ -196,10 +305,13 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
         else:
             neg.append(instance)
     else:
-        for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
-            document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        # for it in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二', '十三', '十四', '十五']:
+        #     document_text = document_text.replace(',%s、' % it, '。%s、' % it)
+        for it in re.finditer('[^\w\d][一二三四五六七八九十]{1,3}、', document_text):
+            t = it.group(0)
+            document_text = document_text.replace(t, '。' + t[1:])
         for sentence in document_text.split('。'):
-            if len(sentence)<2:
+            if len(sentence)<5:
                 # print("句子长度小于5")
                 # print(sentence)
                 continue
@@ -249,7 +361,8 @@ def create_instances_from_document(docid, document_text, product_list, reasons_l
                 else:
                     neg.append(instance)
     random.shuffle(neg)
-    neg = neg[:min(5, 10*len(pos))]
+    # neg = neg[:min(5, 10*len(pos))]
+    neg = neg[:min(5, 2*len(pos))]
     instances = pos+neg
     random.shuffle(instances)
     return instances
@@ -259,37 +372,92 @@ def create_training_instances(df):
     # df = pd.read_excel(xlsx)
     df.fillna('', inplace=True)
     for i in df.index:
-        try:
-            docid = df.loc[i, 'docid']
-            document_text = df.loc[i, 'text']
-            product_list = json.loads(df.loc[i, 'lbset'])
-            reasons_list = json.loads(df.loc[i, 'reasons_list'])
-            # if reasons_list == []:
-            #     continue
-            instances.extend(
-                create_instances_from_document(
-                    docid, document_text, product_list, reasons_list
-                ))
-        except Exception as e:
-            print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
+        if i % 5000==0:
+            print('create_instance', i)
+        # try:
+        docid = df.loc[i, 'docid']
+        document_text = df.loc[i, 'text']
+        product_list = json.loads(df.loc[i, 'lbset'])
+        reasons_list = json.loads(df.loc[i, 'reasons_list'])
+
+        notin_num = 0
+        for i in range(len(product_list)):  # if a product is missing from the text, retry allowing punctuation between its characters
+            p = product_list[i]
+            if re.search('[^\w]', p) == None and re.search(p, document_text) == None:
+                ser = re.search('[^\w]{,2}'.join(p), document_text)
+                if ser:
+                    product_list[i] = ser.group(0)
+                elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                    product_list[i] = p.replace('项目', '采购项目')
+                elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                    product_list[i] = p.replace('项目', '')
+                elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                    product_list[i] = p.lower()
+                elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                    product_list[i] = p.upper()
+                    document_text = document_text.upper()
+                else:
+                    title = get_title(docid)
+                    if title not in document_text:
+                        document_text = title + "。" + document_text
+                        ser = re.search('[^\w]{,2}'.join(p), document_text)
+                        if ser:
+                            product_list[i] = ser.group(0)
+                        elif '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
+                            product_list[i] = p.replace('项目', '采购项目')
+                        elif '项目' in p and re.search(p.replace('项目', ''), document_text):
+                            product_list[i] = p.replace('项目', '')
+                        elif re.search('[a-zA-Z]', p) and re.search(p.lower(), document_text):
+                            product_list[i] = p.lower()
+                        elif re.search('[a-zA-Z]', p) and re.search(p.upper(), document_text.upper()):
+                            product_list[i] = p.upper()
+                            document_text = document_text.upper()
+                        else:
+                            # print('docid:%s,not in text product: %s' % (docid, p))
+                            notin_num += 1
+                            if re.search('业绩', document_text) == None:
+                                product_notin.append((docid, p))
+                    else:
+                        # print('docid:%s,not in text product: %s'%(docid, p))
+                        notin_num +=1
+                        if re.search('业绩', document_text) == None:
+                            product_notin.append((docid, p))
+        if notin_num > len(product_list)/2:
+            print('找到的产品少于一半,过滤掉', docid, product_list)
+            continue
+
+        # if reasons_list == []:
+        #     continue
+        instances.extend(
+            create_instances_from_document(
+                docid, document_text, product_list, reasons_list
+            ))
+        # instances.extend(
+        #     create_instances_from_document_remove_punctuation(
+        #         docid, document_text, product_list, reasons_list
+        #     ))
+        # except Exception as e:
+        #     print('json出错',i,  df.loc[i, 'lbset'], type(df.loc[i, 'lbset']), e)
     return instances
 
-def write_instance_to_example_files(instances, word2index, tag2index, output_dir):
+def write_instance_to_example_files(instances, word2index, tag2index, output_dir, tfrecode_name):
     # writers = []
     # instances = sorted(instances, key=lambda x: len(x.word_list))
     i = 0
     # for max_len in [200, 500, 1000]:
-    writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    # writer = tf.python_io.TFRecordWriter(output_dir + '/maxlen_%s_addunk_product_reason.tfrecode'%max_len)
+    writer = tf.python_io.TFRecordWriter(output_dir + '/%s'%tfrecode_name)
     # print('排序前:', [len(x.word_list) for x in instances[:5]])
     # instances.sort(key=lambda x:len(x.word_list), reverse=True)
     # print('排序后:', [len(x.word_list) for x in instances[:5]])
     while i < len(instances):
+        if i % 5000 == 0:
+            print('开始写入', i)
         instance = instances[i]
         if len(instance.word_list)>max_len:
             writer.close()
             break
         i += 1
-        # word_ids = [word2index.get(word, max_id) for word in instance.word_list]
         word_ids = [word2index.get(word, word2index.get('<unk>')) for word in instance.word_list]
         tag_ids = [tag2index.get(tag, 0) for tag in instance.tag_list]
         while len(word_ids)<max_len:
@@ -303,42 +471,117 @@ def write_instance_to_example_files(instances, word2index, tag2index, output_dir
         writer.write(tf_example.SerializeToString())
     writer.close()
 
+def 去除标注不在公告里面的公告(df):
+    df['notin'] = df.apply(
+        lambda x: json.dumps([it for it in json.loads(x['lbset']) if re.sub('[^\w]', '', it) not in re.sub('[^\w]', '', x['text'])],
+                             ensure_ascii=False), axis=1)
+    df = df[df['notin']=='[]']
+    return df
+
+
 if __name__ == "__main__":
-    df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
-    df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
-    df = df[df.loc[:, 'pos']==0]  # filter out articles without annotated failure reasons
-    df.reset_index(drop=True, inplace=True)
-    print('总文章数:',len(df))
-    df.fillna('', inplace=True)
-    print('读取完毕')
-    df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
-    lbset = [it for l in df['lbs'] for it in l]
-    c = collections.Counter(lbset)
-    m = c.most_common()
-    m3 = [it[0] for it in m if it[1] > 2]
-    df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
-    df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
-    print('len_df_dev:', len(df_dev))
-    df_reason = df[df.loc[:, 'reasons_list'] != '[]'].sample(frac=0.1, random_state=8)
-    print('len(df_reason)', len(df_reason))
-    df_dev.append(df_reason)
-    df_dev.drop_duplicates(subset=['docid'], inplace=True)
-    print('len_df_dev:', len(df_dev))
-    df_train = df[~df.index.isin(df_dev.index)]
-    print(len(df), len(df_dev), len(df_train))
-    df_train = df_train.sample(frac=1)
-    df_dev = df_dev.sample(frac=1)
-
-    # file = 'data/traindata.xlsx'
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+
+    # df = pd.read_excel('E:/产品及失败原因标注数据/所有产品标注数据筛选20211125_ProductAndReason.xlsx')
+    # # tfrecode_name = '20211125_ProductAndReason.tfrecode'
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    #
+    # df1 = pd.read_excel('E:/产品及失败原因标注数据/桥客宝产品数据1.xlsx')
+    #
+    # # tfrecode_name = 'qiaokebao1_product.tfrecode'
+    # df2 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据2.csv')
+    #
+    # # tfrecode_name = 'qiaokebao2_product.tfrecode'
+    # df3 = pd.read_csv('E:/产品及失败原因标注数据/桥客宝产品数据3.csv')
+    # df = df.append([df1, df2, df3], ignore_index=True)
+    #
+    # tfrecode_name = 'all_product.tfrecode'
+    #
+    # df = df[['docid', 'text', 'lbset', 'reasons_list']]
+    # df.fillna('', inplace=True)
+    # df['pos'] = df.apply(lambda x:1 if re.search('(流标|废标|终止|中止|失败|异常)(公告|公示)', x['text']) and x['reasons_list']=='[]' else 0, axis=1)
+    # df = df[df.loc[:, 'pos']==0]  # filter out articles without annotated failure reasons
+    # df.reset_index(drop=True, inplace=True)
+    # print('总文章数:',len(df))
+    # df.fillna('', inplace=True)
+    # print('读取完毕')
+    # df['lbs'] = df['lbset'].apply(lambda x: json.loads(x))
+    # lbset = [it for l in df['lbs'] for it in l]
+    # c = collections.Counter(lbset)
+    # m = c.most_common()
+    # m3 = [it[0] for it in m if it[1] > 2]
+    # print('m3: ', m3[:20])
+    # df['pos'] = df['lbs'].apply(lambda x: 1 if len(set(m3) & set(x)) >= 1 else 0)
+    # print('sum(pos): ', sum(df['pos']))
+    # df_dev = df[df.loc[:, 'pos'] == 1].sample(frac=0.1, random_state=8)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_reason = df[df.loc[:, 'reasons_list'] != '[]']
+    # if len(df_reason)>10:
+    #     df_reason = df_reason.sample(frac=0.1, random_state=8)
+    #     print('len(df_reason)', len(df_reason))
+    #     df_dev.append(df_reason)
+    # df_dev.drop_duplicates(subset=['docid'], inplace=True)
+    # print('len_df_dev:', len(df_dev))
+    #
+    # df_train = df[~df.index.isin(df_dev.index)]
+    # print(len(df), len(df_dev), len(df_train))
+    # df_train = df_train.sample(frac=1)
+    # df_dev = df_dev.sample(frac=1)
+
+
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕',len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[:sp]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train1.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
     instances = create_training_instances(df_train)
+    del df_train
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/train_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成1')
+    with open('E:\产品及失败原因标注数据/product_notin1.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
 
+    df_train = pd.read_csv('E:/产品及失败原因标注数据/df_train.csv')
+    print('读取完毕', len(df_train))
+    sp = len(df_train)//2
+    df_train = df_train[sp:]
+    tfrecode_name = 'ProductAndReason_2023-02-24_train2.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'  # strips symbols from text and products
+    instances = create_training_instances(df_train)
+    del df_train
+    # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
+    tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+    output_dir = 'data/train_data'
+    print('准备写入')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('完成2')
+    with open('E:\产品及失败原因标注数据/product_notin2.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+    df_dev = pd.read_csv('E:/产品及失败原因标注数据/df_dev.csv')
+
+    print('去除前', len(df_dev))
+    # df_dev = 去除标注不在公告里面的公告(df_dev)
+    # print('去除后', len(df_dev))
+    #
+    tfrecode_name = 'ProductAndReason_2023-02-24_dev.tfrecode'
+    # tfrecode_name = 'ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
     instances = create_training_instances(df_dev)
+    del df_dev
     # word2index = {'<unk>':0, '我':1, '们':2, '中':3, '国':4, '人':5}
     tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
     output_dir = 'data/test_data'
-    write_instance_to_example_files(instances, word2id, tag2index, output_dir)
-    print('全部写入成功!')
+    write_instance_to_example_files(instances, word2id, tag2index, output_dir, tfrecode_name)
+    print('全部写入成功!')
+    with open('E:\产品及失败原因标注数据/product_notin3.pkl', 'wb') as f:
+        pickle.dump(product_notin, f)
+
+
+cursor.close()
+conn.close()
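
The product-matching fallback above is applied twice verbatim; a distilled sketch of the order it tries (hypothetical helper name, not part of the commit):

import re

def find_product_variant(p, document_text):
    """Return the form of product p that occurs in document_text, or None.
    Assumes p contains only word characters (the caller checks this first)."""
    ser = re.search('[^\w]{,2}'.join(p), document_text)  # allow up to 2 symbols between characters
    if ser:
        return ser.group(0)
    if '项目' in p and re.search(p.replace('项目', '采购项目'), document_text):
        return p.replace('项目', '采购项目')
    if '项目' in p and re.search(p.replace('项目', ''), document_text):
        return p.replace('项目', '')
    if re.search('[a-zA-Z]', p):
        if re.search(p.lower(), document_text):
            return p.lower()
        if re.search(p.upper(), document_text.upper()):
            return p.upper()  # the caller also upper-cases document_text in this branch
    return None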

+ 11 - 23
BiddingKG/dl/product/data_util.py

@@ -11,28 +11,16 @@ import numpy as np
 import pandas as pd
 from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word,viterbi_decode, load
 
-tag2index = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
-id_to_tag = {v:k for k,v in tag2index.items()}
-# id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+tag2id = {'S':0,'B-pro':1, 'I-pro':2, 'E-pro':3, 'B-rea':4, 'I-rea':5, 'E-rea':6}
+id_to_tag = {v:k for k,v in tag2id.items()}
 
-word_model = getModel_word()
-vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
-word2id = {k: v for v, k in enumerate(vocab)}
+path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
+path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
+vocab = load(path1)
+matrix = load(path2)
 max_id = len(vocab)
+word2id = {k: v for v, k in enumerate(vocab)}
 
-# path1 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_vocab.pk"
-# path2 = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))+"/interface/codename_w2v_matrix.pk"
-# vocab = load(path1)
-# matrix = load(path2)
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}
-
-# vocab = ["<pad>"] + word_model.index2word+ ["<unk>"]
-# matrix = np.zeros((len(vocab), 60))
-# for i in range(1, len(vocab)-1):
-#     matrix[i] = word_model[vocab[i]]
-# max_id = len(vocab)
-# word2id = {k: v for v, k in enumerate(vocab)}
 
 def df2data(df):
     import pandas as pd
@@ -211,8 +199,8 @@ def process_data(sentences):
     :return: 数字化后的统一长度
     '''
     maxLen = max([len(sentence) for sentence in sentences])
-    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
-    # tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
+    # tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    tags = [[word2id.get(k, word2id.get('<unk>')) for k in sentence] for sentence in sentences]
     pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
     return pad_tags
 
@@ -225,8 +213,8 @@ def get_ner(BIE_tag):
 def decode(logits, lengths, matrix):
     paths = []
     small = -1000.0
-    start = np.asarray([[small]*4+[0]])
-    # start = np.asarray([[small]*7+[0]])
+    # start = np.asarray([[small]*4+[0]]) # product only
+    start = np.asarray([[small]*7+[0]]) # product and failure reason
     for score, length in zip(logits, lengths):
         score = score[:length]
         pad = small * np.ones([length, 1])

+ 2 - 1
BiddingKG/dl/product/main.py

@@ -116,7 +116,8 @@ def save_model_pb():
     #
     # 把cpkt转为pb
 
-    input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    # input_checkpoint = "model/ner_epoch5_f10.6855_loss1.3800.ckpt"
+    input_checkpoint = "model/ner_epoch22_f10.7923_loss1.1039.ckpt" #2023/4/6
     saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=True)
     graph = tf.get_default_graph()  # 获得默认的图
     input_graph_def = graph.as_graph_def()  # 返回一个序列号

+ 79 - 0
BiddingKG/dl/product/predict.py

@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2023/3/27 10:19
+"""
+from BiddingKG.dl.product.product_model import Product_Model
+import os
+import re
+import time
+import pandas as pd
+import tensorflow as tf
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
+def predict():
+    ckpt_path = "model"
+    import json
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch16_f10.8000_loss1.0775.ckpt')
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch7_f10.7998_loss1.0508.ckpt')
+        model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch22_f10.7923_loss1.1039.ckpt') # retrained after data cleanup
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch18_f10.8000_loss1.1276.ckpt') # newer
+        # model.saver.restore(sess, os.path.dirname(__file__) + '/model/ner_epoch5_f10.6855_loss1.3800.ckpt') # older
+        t1 = time.time()
+
+        print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+        # df = pd.read_csv(os.path.dirname(__file__) + '/data/df_test.csv') #../test/
+        df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+        print('公告数量:', len(df))
+        df.fillna('', inplace=True)
+        # df = pd.read_excel('data/所有产品标注数据筛选测试数据2021-12-01_pred.xlsx')
+        df.reset_index(drop=True, inplace=True)
+        rs = []
+        for i in df.index:
+            text = df.loc[i, 'text']
+            # result = model.evaluate_line(sess, text)
+            # print(result[0][1])
+            # rs.append(json.dumps(result[0][1], ensure_ascii=False))
+
+            tmp = []
+            for line in text.split('。'):
+                # line = re.sub('[^\w]', '', line)
+                # if len(line) < 5:
+                #     continue
+                result = model.evaluate_line(sess, line)
+                # print(result[0][1])
+                tmp.extend(result[0][1])
+            rs.append(json.dumps(tmp, ensure_ascii=False))
+        df['predict_new'] = pd.Series(rs)
+        df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx', index=False)
+        print('elapsed: ', time.time()-t1)
+        return df
+
+def 统计准确率(df):
+    import json
+    # df = pd.read_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+    df['pr'] = df['predict_new'].apply(lambda x:set([it[0] for it in json.loads(x)]))
+    df['lb'] = df['lbset'].apply(lambda x: set(json.loads(x)))
+    df['pos'] = df.apply(lambda x:1 if x['pr']==x['lb'] else 0, axis=1)
+    eq = lb = pr = 0
+    for i in df.index:
+        pred = df.loc[i, 'pr']
+        label = df.loc[i, 'lb']
+        lb += len(label)
+        pr += len(pred)
+        eq += len(pred&label)
+    acc = eq/(pr + 1e-10)  # epsilon guards against an empty prediction set, as in train.py
+    recall = eq/(lb + 1e-10)
+    f1 = acc*recall*2/(acc+recall+1e-10)
+    print('precision:%.4f, recall:%.4f, F1:%.4f'%(acc, recall, f1))  # precision:0.6489, recall:0.8402, F1:0.7323
+    # df.to_excel(os.path.dirname(__file__) + '/data/df_test_pred.xlsx')
+
+if __name__ == "__main__":
+    df = predict()
+    统计准确率(df)
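
For reference, 统计准确率 (literally "compute accuracy stats") scores the exact set overlap between predicted and labelled product names per document; a toy illustration with hypothetical label/prediction pairs:

    rows = [
        (['电脑', '打印机'], ['电脑']),   # (label set, predicted set)
        (['课桌'], ['课桌', '椅子']),
    ]
    eq = sum(len(set(p) & set(l)) for l, p in rows)  # 2 matching items
    pr = sum(len(set(p)) for _, p in rows)           # 3 predicted
    lb = sum(len(set(l)) for l, _ in rows)           # 3 labelled
    acc, recall = eq / pr, eq / lb
    print('precision=%.2f recall=%.2f f1=%.2f' % (acc, recall, 2*acc*recall/(acc+recall)))
    # precision=0.67 recall=0.67 f1=0.67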

+ 63 - 27
BiddingKG/dl/product/train.py

@@ -9,6 +9,9 @@
 # @Time    : 2021/1/13 0013 10:12
 import os
 import re
+import time
+import logging
+logging.basicConfig(level=logging.DEBUG)
 print('about to import tf')
 import tensorflow as tf
 print('about to import np')
@@ -19,11 +22,14 @@ print('about to import max_len')
 from BiddingKG.dl.product.data_tfrecord import max_len
 # from BiddingKG.dl.common.Utils import viterbi_decode
 print('about to set up CUDA env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 # max_len = 500
+batch_size = 256
+MIN_AFTER_DEQUEUE = batch_size*500
 
-def read_tfRecord(sess, file_tfRecord):
-    queue = tf.train.string_input_producer([file_tfRecord])
+def read_tfRecord(sess, file_list):
+    # queue = tf.train.string_input_producer([file_tfRecord])
+    queue = tf.train.string_input_producer(file_list)
     reader = tf.TFRecordReader()
     filename_, serialized_example = reader.read(queue)
     features = tf.parse_single_example(
@@ -39,13 +45,13 @@ def read_tfRecord(sess, file_tfRecord):
     text_len = tf.cast(features['text_len'], tf.int64)
     return text_len, word_ids , tag_ids
 
-def get_batch_record(sess,filename, batch_size):
-    text_len, word_ids, tag_ids = read_tfRecord(sess, filename)
+def get_batch_record(sess,file_list, batch_size):
+    text_len, word_ids, tag_ids = read_tfRecord(sess, file_list)
     text_len, word_ids, tag_ids = tf.train.shuffle_batch([text_len, word_ids , tag_ids],
                                                      batch_size=batch_size,
-                                                     capacity=200+batch_size*3,
-                                                     min_after_dequeue=1,
-                                                     num_threads=5)
+                                                     capacity=MIN_AFTER_DEQUEUE+batch_size*3,
+                                                     min_after_dequeue=MIN_AFTER_DEQUEUE,
+                                                     num_threads=8)
     text_len = tf.squeeze(text_len, squeeze_dims=1)
     return text_len, word_ids , tag_ids
 
@@ -60,14 +66,36 @@ def total_sample(file_name):
         sample_num += 1
     return sample_num
 
-if __name__ == "__main__":
-    print('entering main')
-    filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
-    filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
-    assert os.path.exists(filename)
-    assert os.path.exists(filename_dev)
+def train():
+    logging.info('entering train')
+    # filename = os.path.dirname(__file__)+'/data/train_data/maxlen_500_addunk_product_reason.tfrecode'
+    # filename_dev = os.path.dirname(__file__)+'/data/test_data/maxlen_500_addunk_product_reason.tfrecode'
+    # print('os.path.dirname(__file__): ', os.path.dirname(__file__))
+    # print('filename path :', filename)
+    # assert os.path.exists(filename)
+    # assert os.path.exists(filename_dev)
+
+    file_list = []
+    file_list_dev = []
+    train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train1.tfrecode'
+    train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-02-24_train2.tfrecode'
+    dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-02-24_dev.tfrecode'
+
+    # train1 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train1.tfrecode'
+    # train2 = os.path.dirname(__file__)+'/data/train_data/ProductAndReason_2023-03-30_remove_punctuation_train2.tfrecode'
+    # dev1 = os.path.dirname(__file__)+'/data/test_data/ProductAndReason_2023-03-30_remove_punctuation_dev.tfrecode'
+
+    # print('filename path :', train1, os.path.exists(train1))
+
+    file_list.append(train1)
+    file_list.append(train2)
+
+    file_list_dev.append(dev1)
+
+
     print('check that the tfrecord files exist')
-    batch_size = 100
+    print('filename path :', train1, os.path.exists(train1))
+    # batch_size = 512
     # id_to_tag = {0: 'O', 1: 'B', 2: 'I', 3: 'E'}
     tag2index = {'S': 0, 'B-pro': 1, 'I-pro': 2, 'E-pro': 3, 'B-rea': 4, 'I-rea': 5, 'E-rea': 6}
     id_to_tag = {v:k for k,v in tag2index.items()}
@@ -88,25 +116,28 @@ if __name__ == "__main__":
         init_op = tf.global_variables_initializer()
         sess.run(init_op)
         print('variables initialized')
-        text_len, word_ids, tag_ids = get_batch_record(sess, filename, batch_size=batch_size)
+        text_len, word_ids, tag_ids = get_batch_record(sess, file_list, batch_size=batch_size)
         print('get_batch_record')
-        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, filename_dev, batch_size=batch_size)
+        text_len_dev, word_ids_dev, tag_ids_dev = get_batch_record(sess, file_list_dev, batch_size=batch_size)
         print('get_batch_record_dev')
         coord = tf.train.Coordinator()
         threads = tf.train.start_queue_runners(coord=coord)
-        print('total_sample(filename)', total_sample(filename))
 
-        total_num = total_sample(filename)
+        total_num = sum([total_sample(filename) for filename in file_list])
+        logging.info('total_train_num: %d'%total_num)
         batch_num = total_num//batch_size
-        batch_num_dev = total_sample(filename_dev)//batch_size
+        batch_num_dev = sum([total_sample(filename_dev) for filename_dev in file_list_dev])//batch_size
         num = 0
         l = []
-        max_f1 = 0
 
+        max_f1 = 0.79
         # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch10_f10.6875_loss1.5230.ckpt'))
-        # print('model restored successfully')
+        # model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch0_f10.7740_loss1.2526.ckpt'))
+        model.saver.restore(sess, os.path.join(os.path.dirname(__file__)+'/model','ner_epoch16_f10.8000_loss1.0775.ckpt'))
+        print('model restored successfully')
 
-        for epoch in range(50):
+        for epoch in range(20,50):
+            t1 = time.time()
             for batch in range(batch_num):
                 text_len_, word_ids_, tag_ids_ = sess.run([text_len, word_ids, tag_ids])
                 # print(text_len_.shape, word_ids_.shape, tag_ids_.shape)
@@ -118,9 +149,10 @@ if __name__ == "__main__":
 
 
                 if batch % 100==0:
-                    print('loss_:', loss_, '\tglobel_step_:',globel_step_)
+                    logging.info('loss_:%.4f,\tglobel_step_: %d'%(loss_, globel_step_))
+                    print('elapsed:', time.time()-t1)
                 num += text_len_.shape[0]
-            print('trained samples:%d, total samples:%d'%(num, total_num))
+            # print('trained samples:%d, total samples:%d'%(num, total_num))
 
             results = []
             trans = model.trans.eval()
@@ -154,15 +186,19 @@ if __name__ == "__main__":
             recall = equal_num / (gold_num + 1e-10)
             f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
             val_loss = np.mean(loss)
-            print('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
+            logging.info('epoch: %d, f1:%.4f, acc:%.4f, recall:%.4f, val_loss:%.4f'%(epoch, f1, precision, recall, val_loss))
             if f1>max_f1:
                 max_f1 = f1
                 model.saver.save(sess, os.path.join(os.path.dirname(__file__)+'/model', "ner_epoch%d_f1%.4f_loss%.4f.ckpt"%(epoch,f1, val_loss)))
-                print('save model, max_f1:%.4f' %f1)
+                logging.info('save model, max_f1:%.4f' %f1)
 
         coord.request_stop()
         coord.join(threads)
 
+if __name__ == "__main__":
+    train()
+
+
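
For reference, the queue-runner input pipeline used here (string_input_producer + shuffle_batch) is deprecated in later TF 1.x releases; a rough tf.data equivalent of get_batch_record, assuming each record carries text_len, word_ids and tag_ids at max_len = 500 (the exact feature spec is not shown in this diff):

    import tensorflow as tf

    def make_dataset(file_list, batch_size, min_after_dequeue):
        feature_spec = {  # assumed shapes; mirror data_tfrecord.py in practice
            'text_len': tf.FixedLenFeature([1], tf.int64),
            'word_ids': tf.FixedLenFeature([500], tf.int64),
            'tag_ids': tf.FixedLenFeature([500], tf.int64),
        }
        def parse(record):
            f = tf.parse_single_example(record, feature_spec)
            return f['text_len'][0], f['word_ids'], f['tag_ids']
        return (tf.data.TFRecordDataset(file_list)
                .map(parse, num_parallel_calls=8)   # matches num_threads=8 above
                .shuffle(min_after_dequeue)         # matches min_after_dequeue
                .batch(batch_size)
                .repeat())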
 
 
 

+ 3 - 2
BiddingKG/dl/ratio/re_ratio.py

@@ -3,7 +3,7 @@ from decimal import Decimal
 # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)'
 # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)'
 
-ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
+ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|折扣系数|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}'
                    '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)'
                    '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)')
 ratio = ratio.pattern
@@ -182,7 +182,7 @@ def getUnifyNum(money):
                         result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit))
                 # subMoneys[0]中无金额单位,不可再拆分
                 elif subMoneys[0] == "":
-                    result += 0
+                    result += getMultipleFactor(factorUnit)
                 elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None:
                     # print(subMoneys)
                     # subMoneys[0] = subMoneys[0][0]
@@ -223,6 +223,7 @@ def test_str():
 费率):12
 折扣率(%):99.2063
 投标报价:96.00%(折扣率
+折扣系数:86(%)
 '''
     # s = '下浮率 百分之十点零陆(10.00%'
     print(extract_ratio(s))
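
A quick sanity check that the amended pattern picks up the new 折扣系数 alternative (the regex below is cut down to the relevant branch; the full pattern is the one defined above):

    import re

    ratio_branch = '(折扣系数|折扣率)([((]?[%‰][))]?|)[为是:: ,]{0,3}[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?'
    print(re.search(ratio_branch, '折扣系数:86(%)').group())  # 折扣系数:86(%)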

BIN
BiddingKG/dl/table_head/best_tiny.hdf5


BIN
BiddingKG/dl/table_head/best_tiny_230628.hdf5


+ 10 - 6
BiddingKG/dl/time/re_servicetime.py

@@ -20,9 +20,10 @@ TEST_MODE = False
 
 before = '(?P<before>' \
          '合同期限|工期/交货期/服务期|工期,|工期\(交货期\)|合格工期|服务期限|工期' \
-         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
+         '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期' \
          '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
-         '|交货时间|工期|质保期' \
+         '|交货时间|工期' \
+         '|保洁期限|维保期|管理年限|工期承诺|(服务|合同|施工|实施|工程|设计)(年限|期限|周期|期:)' \
          '|服务期限为|计划工期|工期要求|服务期限|服务期' \
          '|投标工期|设计工期|合格服务周期|总工期|服务时间(范围)?|流转期限|维护期限|服务时限|交货期' \
          '|完成时间|服务期限|中标工期|项目周期|期限要求|周期|供货期|合同履行日期|计划周期' \
@@ -61,7 +62,7 @@ before2 = '(?P<before2>' \
         # '|[自从于].{2,15}之日[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾]{1,4}' \
 
 before3 = '(?P<before3>' \
-          '([\((]日历天[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
+          ',?([\((](日历天|施工时间)[\))]|[\((]天[\))]|[\((]年[\))]|[\((]月[\))])?' \
           ')'
 
 before4 = '(?P<before4>' \
@@ -98,7 +99,7 @@ number = '(?P<number>' \
          ')'
 
 after = '(?P<after>' \
-        '[个,,(\(]*(日历|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
+        '[个,,(\(]*(日历|历天|工作|学|)([年月日天周]|周年|整年)(内|)|\)|)|' \
         ')'
         # '|周|号|天|个月|个年|((|\(|)年()|\)|)|((|\(|)月()|\)|)|((|\(|)日()|\)|)' \
         # '|个日历天|日历天|\(日历天\)|\(天\)|周内|,日历天|工作日|个工作日|' \
@@ -265,7 +266,7 @@ def filter_service_time(output_list, text_index_list):
         if not re.findall(reg_right_digit, output):
             delete_list.append([output, text_index_list[i]])
             continue
-        if not re.findall(reg_right_unit, output):
+        if not re.findall(reg_right_unit, output) and not re.match('^\d{1,3}$', output):
             delete_list.append([output, text_index_list[i]])
             continue
         # 包含不要的字
@@ -352,7 +353,10 @@ def extract_servicetime(text):
 def test_from_str():
     # s = """
     # """
-    s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "5元/年 服务期:交付使用之日起三年; 承诺服务等级"
+    # s = "交货,1.交货时间:7天,2.交货地点:广东清远市清城区飞来峡镇人民政府高田应急安置点"
+    s = ''',莆田市财政局走廊及卫生间吊顶改造工程中标结果公告,莆田市财政局走廊及卫生间吊顶改造工程,工程预算价236878元,发包价194240元,招标编号为:宏福莆招字【2020】H001号,该项目招标方式为:邀请招标。2020年04月07日开标,2020年04月07日评标完成,中标主要结果公示如下:中标人名称,福建省东海伟业建设有限公司,中标价:194240元,评标办法,随机抽取法,资格评审结果,注册建造师:合格:余爱华(注册编号:闽235141578763),履约保证金(元):合格:合同金额的10%,施工工期:14日历天,工程质量,备注,被确定为废标、无效标的投标人及原因:合格:无废标,资格审查小组:合格:王宗仙、林慧灵、谢淑青,根据评标结果确定福建省东海伟业建设有限公司为中标人,现在莆田市财政局网上(http://czj.putian.gov.cn/)公示。中标公示期自2020年04月08日至2020年04月10日。投标人对中标结果有异议或认为评标活动存在违法违规行为,可在公示期内向相关主管部门投诉,招标单位:招标代理机构:莆田市财政局,福建省宏福工程管理有限公司,联系电话:0594-2694413,联系电话:15160467775,2020年04月08日,2020年04月08日,
+'''
     print(extract_servicetime(s))
     print(re.findall('(\d{2,4}[-.年/]|\d{1,2}[-.月/]|\d{1,2}[日号]?)+[-~~起至到—]+\d{2,4}[-.年/]', s))
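
The relaxed unit check above now keeps bare 1-3 digit outputs (e.g. a 工期 of '14' whose 日历天 unit was consumed by the surrounding pattern); a minimal sketch, with reg_right_unit as a stand-in since its real definition is not shown in this diff:

    import re

    reg_right_unit = '[年月日天周]|日历天|工作日'  # stand-in pattern
    for output in ['14日历天', '14', '14号楼']:
        keep = bool(re.findall(reg_right_unit, output)) or bool(re.match('^\d{1,3}$', output))
        print(output, keep)  # 14日历天 True / 14 True / 14号楼 False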
 

+ 1 - 0
BiddingKG/readme/start.md

@@ -9,6 +9,7 @@ cd /data/python
 ps -ef | grep run_extract_server | grep -v grep | cut -c 9-16| xargs kill -9
 #start the extraction service
 nohup /data/anaconda3/envs/py37/bin/gunicorn -w 15 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+nohup gunicorn --workers 3 --limit-request-fields 0 --limit-request-line 0 -t 1000 --keep-alive 600 -b 192.168.2.102:15030 run_extract_server:app > extract.log 2>&1 &
 #nohup /data/anaconda3/envs/py37/bin/python run_extract_server.py >> extract.log port=15030 worker=14 &
 
 #19022: start the element-extraction service

+ 2 - 2
BiddingKG/run_extract_server.py

@@ -81,7 +81,7 @@ def run_thread(data,list_result):
     web_source_no = data.get("web_source_no","")
     web_source_name = data.get("web_source_name","")
     original_docchannel = data.get("original_docchannel","")
-    print("web_source_name:",web_source_name)
+    # print("web_source_name:",web_source_name)
     is_fail = False
     try:
         if _content!="":
@@ -98,7 +98,7 @@ def run_thread(data,list_result):
     # 以json形式返回结果
     #_resp = json.dumps(data_res,cls=MyEncoder)
     #log(str(data["flag"])+str(data))
-    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+    # log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
     list_result.append(data_res)
     if is_fail:
         list_result.append(is_fail)

Some files were not shown because too many files changed in this diff