# Source repository: luojiehua/BIDI_ML_INFO_EXTRACTION
import ast
import pandas as pd
import re

# from BiddingKG.dl.interface import Entitys


# def re_bidway_old(text):
#     df = pd.read_csv("C:\\Users\\admin\\Desktop\\bidway_text.csv")
#
#     reg = re.compile(u'(采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式'
#                      u'|发包方式|发包类型|开展方式|招标类型)(.*)'
#                      u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
#                      u'|电子书面竞投|邀请招标|定向公开|询价采购|抽签摇号'
#                      u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
#                      u'|网上招标|其他'
#                      u'|竞谈竞价|网上直购|公开竞谈'
#                      u'|库内邀请|库内公开发包)')
#
#     # reg = re.compile(u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源|网络竞价'
#     #                  u'|竞争性谈判|公开询价|邀请招标|公开招募|公开询比价|电子书面竞投'
#     #                  u'|网上电子投标|比质比价|定向询单|国内比选|电子竞价'
#     #                  u'|公开招租|公开竞标方式|网上招标|公开招标|国内竞争性谈判'
#     #                  u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
#     #                  u'|库内邀请|询价采购|询比采购|分散采购|单一来源采购)')
#
#     reg2 = re.compile(u'(采用|以|)'
#                       u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
#                       u'|竞争性谈判|询价|电子书面竞投|电子竞价'
#                       u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
#                       u'|网上招标|分散采购'
#                       u'|竞谈竞价|网上直购|公开竞谈'
#                       u'|库内邀请)'
#                       u'(采购方式|方式)')
#
#     reg1 = re.compile(
#         # u'(公开招标|竞争性磋商|竞争性谈判采购|公开采购|单一来源采购|网络竞价|公开招商方式'
#         # u'|竞争性谈判|公开询价|询价采购|邀请招标|公开招募|公开询比|电子书面竞投'
#         # u'|网上电子投标|比质比价|定向询单|询比采购|国内比选|单一来源|公开选取|库内公开发包'
#         # u'|公开招租|公开竞标方式|网上招标|公开招标|竞争性谈判|公开招投标'
#         # u'|国内竞争性磋商|公开竞谈|定向询价|网上询价|网上竞价|公开比选|磋商采购|网上直购'
#         # u'|国际公开竞争性招标)'
#         u'(公开招标|竞争性磋商|竞争性谈判|公开采购|单一来源'
#         u'|竞争性谈判|询价|电子书面竞投'
#         u'|网上电子投标|比质比价|询单|询比采购|比选|单一来源采购'
#         u'|网上招标|分散采购'
#         u'|竞谈竞价|网上直购|公开竞谈'
#         u'|库内邀请)'
#     )
#
#     # 都切为4个字符
#     # reg1_not = re.compile(u'(及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录)')
#     reg1_not = re.compile(u'(及单一来|价小组成|除单一来|性谈判邀|询价记录)')
#
#     reg3 = re.compile(u'(采购方式：邀请|采购方式：公开|采购方式：询价|分散采购|公开招标|竞价|磋商|询比|竞标|邀请招标|公开招募|公开招租)')
#
#
#     reg_standard = re.compile(u'(公开招标|竞争性磋商|竞争性谈判|单一来源'
#                               u'|竞争性谈判|询价|邀请招标|公开招募|询比|电子书面竞投'
#                               u'|网上电子投标|比质比价|询单|比选'
#                               u'|公开招租|网上招标|分散采购'
#                               u'|网上直购|公开竞谈|采购方式：邀请|采购方式：公开|采购方式：询价)'
#                               )
#
#     text_list = df["text"].to_list()
#     # text_list = []
#     # text_list.append(text)
#     text_index_list = []
#     output_list = []
#     for index in range(len(text_list)):
#         # 全文下标
#         text_index = [0, 0]
#
#         input_str = text_list[index]
#
#         # 把一些混淆的词先替换掉
#         input_str = re.sub(reg1_not, "####", input_str)
#
#         match = reg.search(input_str)
#         output_str = None
#         # 根据正则表达式匹配
#         if match:
#             # 更新全文下标
#             text_index[0] = match.start()
#             text_index[1] = match.end()
#
#             # 判断长度，截断
#             if len(match.group()) >= 15:
#                 ss = re.split(",|\.|，|。|;|；", match.group())
#                 # 判断所需的字符串在哪一段
#                 for i in range(len(ss)):
#                     if re.search(reg1, ss[i]):
#                         output_str = ss[i]
#
#                         # 更新全文下标
#                         front_len, back_len = calculateLen(ss, i)
#                         text_index[0] = text_index[0] + front_len + i
#                         text_index[1] = text_index[1] - back_len + len(ss) -1 - i
#
#                         break
#             else:
#                 output_str = match.group()
#
#         else:
#             match2 = re.search(reg2, input_str)
#             if match2:
#                 # 更新全文下标
#                 text_index[0] = match2.start()
#                 text_index[1] = match2.end()
#
#                 output_str = match2.group()
#
#             else:
#                 match1 = re.search(reg1, input_str)
#                 if match1:
#                     # 更新全文下标
#                     text_index[0] = match1.start()
#                     text_index[1] = match1.end()
#                     output_str = match1.group()
#
#         # 再判断一次长度
#         if output_str is not None:
#             if len(output_str) >= 15:
#                 match2 = re.search(reg2, input_str)
#                 if match2:
#                     # 更新全文下标
#                     text_index[0] = match2.start()
#                     text_index[1] = match2.end()
#
#                     output_str = match2.group()
#             if len(output_str) >= 15:
#                 match1 = re.search(reg1, input_str)
#                 if match1:
#                     # 更新全文下标
#                     text_index[0] = match1.start()
#                     text_index[1] = match1.end()
#
#                     output_str = match1.group()
#
#         # 最后输出还为空，匹配一些易混淆的词
#         if output_str is None:
#             match3 = re.search(reg3, input_str)
#             if match3:
#                 # 更新全文下标
#                 text_index[0] = match3.start()
#                 text_index[1] = match3.end()
#
#                 output_str = match3.group()
#
#         # 处理前缀等无用词
#         if output_str is not None:
#             match5 = re.search("分散采购|采购方式：邀请", output_str)
#             if not match5:
#                 # 公开采购转为公开招标
#                 output_str = re.sub("公开采购", "公开招标", output_str)
#
#                 # 去掉第一个字符冒号
#                 ss = re.split("：|:", output_str)
#                 output_str = ss[-1]
#                 # 更新全文下标
#                 front_len, back_len = calculateLen(ss, len(ss) - 1)
#                 text_index[0] = text_index[0] + front_len + len(ss) - 1
#
#                 # 去掉采购、方式、采用
#                 match6 = re.search("(采用|出售|直接（|现就本次|招标为)", output_str)
#                 match7 = re.search("(采购|方式|进行)", output_str)
#                 output_str = re.sub("(采购|方式|采用|出售|进行|直接（|现就本次|招标为)", "", output_str)
#                 # 更新全文下标
#                 if match6:
#                     text_index[0] += match6.end() - match6.start()
#                 if match7:
#                     text_index[1] -= match7.end() - match7.start()
#
#             # 使用标准标签过滤
#             match4 = re.search(reg_standard, output_str)
#             if match4:
#                 output_str = match4.group()
#                 # 更新全文下标
#                 text_index[0] += match4.start()
#                 text_index[1] = text_index[0] + match4.end() - match4.start()
#
#         output_list.append(output_str)
#         # text_index_list.append(str(text_index))
#         text_index_list.append(text_index)
#
#     # df["re"] = pd.DataFrame(output_list)
#     # df["text_index"] = pd.DataFrame(text_index_list)
#
#     # index_to_word = []
#     # for index, row in df.iterrows():
#     #     i_list = ast.literal_eval(row["text_index"])
#     #     word = row["text"][i_list[0]:i_list[1]]
#     #     if len(word) >= 20:
#     #         word = ""
#     #     index_to_word.append(word)
#
#
#     # df["index2word"] = pd.DataFrame(index_to_word)
#     # df.to_csv("C:\\Users\\admin\\Desktop\\bidway_text2.csv")
#
#     return output_list[0], text_index_list[0]

# Regex alternations (joined with "|") used by the extraction helpers below.
# Order matters inside each alternation: Python's regex engine takes the first
# alternative that matches at a position, so longer variants are listed before
# the shorter words they contain where that distinction is wanted.

# Canonical procurement-method names.
normal_bidway = "|".join((
    "公开招标", "邀请招标", "竞争性谈判", "竞争性磋商", "单一来源", "框架协议", "询价",
))

# Extended list of procurement-method wordings.
bidway = "|".join((
    '单一来源',
    '国内竞争性磋商', '竞争性磋商', '竞争性谈判', '网络竞价', '网上竞价', '公开竞谈',
    '公开竞价', '电子竞价', '竞价', '竞标', '竞谈竞价', '电子书面竞投',
    '公开比选', '比质比价', '比选',
    '公开招标', '公开招租', '公开招募', '公开选取', '公开招投标',
    '网上直购', '网上招标', '网上电子投标', '网上挂牌',
    '邀请招标',
    '网上询价', '公开询价', '非定向询价', '定向询价', '询比价', '询单', '询价', '询比',
    '库内邀请', '库内公开发包', '内部邀标',
    '定点采购议价', '定点采购',
    '竞争性评审', '框架协议',
))

# Phrases that contain a method word but are not a procurement method.
not_bidway = "|".join((
    '及单一来源', '询价小组成员', '除单一来源', '竞争性谈判邀请函', '询价记录', '自由竞价',
    '限时竞价', '咨询单位', '询价单',
))

# A method word directly preceded by one of these is masked by re_not_bidway.
not_bidway_preffix = "|".join((
    "本次", "拟", "参加", "无效", "标的", "联合体", "参与", "否决", "除", "可以选择",
    "包括", "涉及", "非",
))

# A method word directly followed by one of these is masked by re_not_bidway.
# A few entries repeat; they are kept so the resulting pattern text is unchanged.
not_bidway_suffix = "|".join((
    "文件", "报名", "邀请", "项目", "失败", "数量", "编号", "后", "时间", "类型", "名称", "和", "成交",
    "标题", "开始", "结束", "产品", "报价", "供应商", "部门", "监督", "需求", "范围", "入围", "内容", "人",
    "条件", "公司", "保证金", "完毕", "事件", "成功", "活动", "地点", "标", "会", "须知", "范围",
    "响应", "报价", "采购公示", "的原因", "采购供应商", "价", "采购人员", "失败", "小组",
))

# Field labels that introduce an explicitly stated method (see re_standard_bidway).
bidway_preffix = "|".join((
    '采购方式', '竞价方式', '招标方式', '询价类型', '交易方式', '寻源策略', '招标形式', '询价方式',
    '发包方式', '发包类型', '开展方式', '招标类型', '选取方式', '招租方式',
))

# Last-resort patterns tried by re_special_bidway; "." stands for any single
# character between the colon and the method word.
bidway_special = "|".join((
    '采购方式：公开', '采购方式：邀请', '采购方式：询价',
    '招标方式：.公开', '采购方式：.公开',
    '分散采购',
))


def re_not_bidway(_str):
    """Mask phrases that look like a procurement method but are not one.

    Three passes run in order, each on the output of the previous one:

    1. phrases listed in ``not_bidway``;
    2. a ``bidway`` word immediately followed by a ``not_bidway_suffix`` word;
    3. a ``not_bidway_preffix`` word immediately followed by a ``bidway`` word.

    Every phrase found is overwritten with ``#`` characters of the same
    length, so character offsets in the returned string still line up with
    the input.

    Args:
        _str: Text to clean.

    Returns:
        The text with the confusable phrases masked.
    """

    def _mask(text, phrases):
        # The phrases are literal matched text, so replace them literally.
        # Feeding them back to re.sub() as a pattern would misbehave as soon
        # as a matched phrase contained a regex metacharacter.
        for phrase in phrases:
            text = text.replace(phrase, "#" * len(phrase))
        return text

    # Pass 1: the pattern has no groups, so findall yields plain strings.
    _str = _mask(_str, re.findall(not_bidway, _str))

    # Pass 2: two groups, so findall yields (method, suffix) tuples.
    reg_not1 = "(" + bidway + ")" + "(" + not_bidway_suffix + ")"
    _str = _mask(_str, ["".join(parts) for parts in re.findall(reg_not1, _str)])

    # Pass 3: (prefix, method) tuples.
    reg_not2 = "(" + not_bidway_preffix + ")" + "(" + bidway + ")"
    _str = _mask(_str, ["".join(parts) for parts in re.findall(reg_not2, _str)])
    return _str


def re_standard_bidway(_str):
    """Find methods stated in the labelled form "<field label><1-2 chars><method>".

    The label comes from ``bidway_preffix`` and the method from ``bidway``.
    For each hit, the window running from the start of the match to 30
    characters past its end is inspected. If it holds more than one canonical
    name (``normal_bidway``), the hit is treated as a list of options: the
    first option whose directly preceding character is not the empty checkbox
    '□' is taken. If every option is preceded by '□', the method is reported
    as '' while the span stays that of the originally matched value.

    Args:
        _str: Text to search.

    Returns:
        A list of ``[method, [start, end]]`` entries with offsets into ``_str``.
    """
    labelled_pattern = "".join([
        "(?P<preffix>", bidway_preffix, ")",
        "(?P<char>.{1,2})",
        "(?P<value>", bidway, ")",
    ])
    option_pattern = '(?P<sign>.{1,2})(?P<bidway>' + normal_bidway + ')+'

    results = []
    for hit in re.finditer(labelled_pattern, _str):
        method = hit.group('value')
        span = list(hit.span('value'))
        window_start = hit.start()
        window = _str[window_start: hit.end() + 30]

        # Several canonical names after one label: a multiple-choice list.
        if len(re.findall(normal_bidway, window)) > 1:
            method = ''
            for option in re.finditer(option_pattern, window):
                if option.group('sign')[-1] == '□':
                    continue  # unchecked box, try the next option
                method = option.group('bidway')
                # Option offsets are relative to the window; shift them back.
                span = [window_start + option.start('bidway'),
                        window_start + option.end('bidway')]
                break

        results.append([method, span])

    return results

def re_normal_bidway(_str):
    """Find canonical procurement-method names (``normal_bidway``) in ``_str``.

    Rules, applied in order:

    * "<method> 转/转为/变更为/更改为 <method>": only the method after the change
      wording is returned.
    * Otherwise every canonical name is collected, skipping '公开招标' when the
      character right before it is '非'.
    * If nothing was collected, the short forms '磋商' / '谈判' directly followed
      by '公告', '成交' or '结果' are accepted.
    * If more than one distinct method was collected, an empty list is returned.

    Args:
        _str: Text to search.

    Returns:
        A list of ``[method, [start, end]]`` entries, possibly empty.
    """
    change_pattern = "".join([
        "(", normal_bidway, ")(转为?|变更为|更改为)",
        "(?P<bidway>(", normal_bidway, "))",
    ])
    changed = re.search(change_pattern, _str)
    if changed:
        return [[changed.group('bidway'), list(changed.span('bidway'))]]

    hits = []
    distinct_methods = set()
    for found in re.finditer("(?P<value>" + normal_bidway + ")", _str):
        name = found.group()
        begin = found.start()
        negated = name == '公开招标' and begin > 0 and _str[begin - 1] == '非'
        if negated:
            continue
        distinct_methods.add(name)
        hits.append([name, list(found.span())])

    if not hits:
        # No canonical name: fall back to the abbreviated wording.
        short_form = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
        if short_form:
            return [[short_form.group('bidway'), list(short_form.span('bidway'))]]

    if len(distinct_methods) > 1:
        # Several different methods are mentioned: report nothing.
        return []
    return hits

def re_all_bidway(_str):
    """Find every procurement-method mention, preferring canonical names.

    The canonical list (``normal_bidway``) is tried first. Only when it yields
    nothing is the extended list (``bidway``) tried. Previously the first pass
    returned unconditionally, which left the extended pass unreachable.

    Args:
        _str: Text to search.

    Returns:
        A list of ``[method, [start, end]]`` entries with offsets into
        ``_str``; empty when neither list matches.
    """
    for alternatives in (normal_bidway, bidway):
        pattern = "(?P<value>" + alternatives + ")"
        found = [[m.group(), list(m.span())] for m in re.finditer(pattern, _str)]
        if found:
            return found
    return []


def re_special_bidway(_str):
    """Find the last-resort patterns listed in ``bidway_special``.

    Args:
        _str: Text to search.

    Returns:
        A list of ``[matched_text, [start, end]]`` entries with offsets into
        ``_str``.
    """
    special_pattern = "(?P<value>" + bidway_special + ")"
    return [
        [hit.group(), list(hit.span())]
        for hit in re.finditer(special_pattern, _str)
    ]


def get_one_word(bidway_list):
    """Reduce a list of candidates to a single method and its span.

    Candidates are de-duplicated by word, keeping for each word the span with
    the smallest start offset. The word that starts earliest wins; if two
    words share a start offset, the longer one wins.

    Args:
        bidway_list: List of ``[word, [start, end]]`` entries.

    Returns:
        ``(word, [start, end])``, or ``(None, [0, 0])`` for an empty list.
    """
    if len(bidway_list) == 0:
        return None, [0, 0]
    if len(bidway_list) == 1:
        return bidway_list[0][0], bidway_list[0][1]

    # Earliest span seen for each distinct word.
    earliest = {}
    for candidate in bidway_list:
        name, span = candidate[0], candidate[1]
        if name not in earliest or span[0] < earliest[name][0]:
            earliest[name] = span

    if len(earliest) == 1:
        word = next(iter(earliest))
    else:
        # Ascending (-start, length): the last item has the smallest start.
        ranked = sorted(
            earliest.items(),
            key=lambda item: (-int(item[1][0]), len(item[0])),
        )
        word = ranked[-1][0]
    return word, earliest[word]


def re_bidway(text, title):
    """Extract one procurement method, trying progressively looser rules.

    Order of attempts; the first one that yields a result wins:

    1. canonical names in the raw title (only when it is under 100 characters);
    2. the labelled form in the cleaned text (first hit);
    3. any mention found by ``re_all_bidway`` in the cleaned title;
    4. any mention found by ``re_all_bidway`` in the cleaned text;
    5. the special patterns in the cleaned text (first hit).

    "Cleaned" means passed through ``re_not_bidway``.

    Args:
        text: Body of the notice.
        title: Title of the notice.

    Returns:
        ``(word, [start, end])``. The offsets refer to whichever string
        produced the hit (title or text). ``(None, [0, 0])`` when nothing is
        found.
    """
    # 1. A canonical method named in a short title takes precedence.
    if len(title) < 100:
        title_hits = re_normal_bidway(title)
        if title_hits:
            return get_one_word(title_hits)

    # Mask confusable phrases before the looser searches.
    text_clean = re_not_bidway(text)
    title_clean = re_not_bidway(title)

    # 2. Explicit "<label>: <method>" statement in the body.
    labelled = re_standard_bidway(text_clean)
    if labelled:
        return labelled[0][0], labelled[0][1]

    # 3 and 4. Any mention, title before body.
    for cleaned in (title_clean, text_clean):
        mentions = re_all_bidway(cleaned)
        if mentions:
            return get_one_word(mentions)

    # 5. Special patterns as a last resort.
    special = re_special_bidway(text_clean)
    if special:
        return special[0][0], special[0][1]

    # Nothing found.
    return None, [0, 0]


def extract_bidway(text, title):
    """Run ``re_bidway`` and wrap a valid result as a list with one dict.

    A result is discarded when its span length differs from the length of the
    word, or when the span is 10 or more characters long.

    Args:
        text: Body of the notice.
        title: Title of the notice.

    Returns:
        ``[{"body": word, "begin_index": start, "end_index": end}]``, or an
        empty list when nothing usable was found.
    """
    word, span = re_bidway(text, title)
    if word is None:
        return []

    span_length = span[1] - span[0]
    if span_length != len(word) or span_length >= 10:
        return []

    return [{"body": word, "begin_index": span[0], "end_index": span[1]}]

# Maps an extracted or annotated wording to its unified method name.
bidway_dict = {
    '询价': '询价',
    '竞争性谈判': '竞争性谈判',
    '公开比选': '其他',
    '国内竞争性磋商': '竞争性磋商',
    '招标方式：t公开': '公开招标',
    '竞价': '竞价',
    '竞标': '竞价',
    '电子竞价': '竞价',
    '电子书面竞投': '竞价',
    '单一来源': '单一来源',
    '网上竞价': '竞价',
    '公开招标': '公开招标',
    '询比': '询价',
    '定点采购': '其他',
    '招标方式：■公开': '公开招标',
    '交易其他，付款其他': '其他',
    '竞争性评审': '竞争性磋商',
    '公开招租': '其他',
    '\\N': '',
    '比选': '其他',
    '比质比价': '其他',
    '分散采购': '其他',
    '内部邀标': '邀请招标',
    '邀请招标': '邀请招标',
    '网上招标': '公开招标',
    '非定向询价': '询价',
    '网络竞价': '竞价',
    '公开询价': '询价',
    '定点采购议价': '其他',
    '询单': '询价',
    '网上挂牌': '其他',
    '网上直购': '其他',
    '定向询价': '询价',
    '采购方式：公开': '公开招标',
    '磋商': '竞争性磋商',
    '公开招投标': '公开招标',
    '招标方式：√公开': '公开招标',
    '公开选取': '公开招标',
    '网上电子投标': '公开招标',
    '公开竞谈': '竞争性谈判',
    '竞争性磋商': '竞争性磋商',
    '采购方式：邀请': '邀请招标',
    '公开竞价': '竞价',
    '其他': '其他',
    '公开招募': '其他',
    '网上询价': '询价',
    '框架协议': '框架协议',
    '谈判': '竞争性谈判',
}


# Unify procurement-method names.
def bidway_integrate(bidway):
    """Return the unified name for ``bidway``, or '其他' when it is not in ``bidway_dict``."""
    return bidway_dict.get(bidway, "其他")

def bidway_normalize(key):
    """Map a method wording to its unified name by keyword.

    The rules are tried top to bottom and the first keyword found anywhere in
    ``key`` decides the result. When no rule applies, ``bidway_dict`` is
    consulted, defaulting to '其他'.

    Args:
        key: Method wording to normalise.

    Returns:
        The unified method name.
    """
    keyword_rules = (
        ('公开招标|公开发包', '公开招标'),
        ('单一来源', '单一来源'),
        ('磋商', '竞争性磋商'),
        ('谈判', '竞争性谈判'),
        ('竞谈|竞价|竞投|竞标', '竞价'),
        ('询价|询比|比价|询单', '询价'),
        ('邀请|邀标', '邀请招标'),
    )
    for keywords, unified_name in keyword_rules:
        if re.search(keywords, key):
            return unified_name
    return bidway_dict.get(key, '其他')

def test_csv():
    """Run ``re_bidway`` over the "text" column of a CSV and save the results.

    Reads a fixed input path, prints each prediction, appends the predictions
    (as strings) as an extra column and writes the frame to a fixed output
    path. Every row is processed with an empty title.
    """
    source_csv = "C:\\Users\\Administrator\\Desktop\\bidway_text.csv"
    result_csv = "C:\\Users\\Administrator\\Desktop\\bidway_result.csv"

    df = pd.read_csv(source_csv)

    predict_list = []
    for _, row in df.iterrows():
        word, text_index = re_bidway(row["text"], "")
        predict = [word, text_index] if word else []
        print("predict", predict)
        predict_list.append(str(predict))

    # Append the predictions as one more column next to the inputs.
    df = pd.concat([df, pd.DataFrame(predict_list)], axis=1)

    df.to_csv(result_csv)
    print("finish write!")


def test_str():
    """Print the extraction result for a hard-coded sample notice (empty title)."""
    # Shorter alternative sample:
    # '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
    sample_notice = '''
    ，关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知，各投标人：深圳市国际招标有限公司受中共
    深圳市委军民融合发展委员会办公室委托，就人防工程技术咨询服务项目【重新招标】(项目编号：0658-2171
    1A60965)，进行公开招标，因投标单位不足三家，公开招标失败，现经采购单位同意，采用单一来源谈判方式
    确定中标供应商，邀请中国建筑标准设计研究院有限公司前来谈判，一、项目编号：0658-21711A60965，二
    、项目名称：人防工程技术咨询服务项目【重新招标】，三、凡被邀请参加谈判的供应商必须按照原招标文件第
    六章要求制作谈判文件正本一本，副本二本，按规定的时间密封递交并参加谈判，四、谈判内容：投标价格、项
    目实施方案、售后服务方案和其它相关事项，五、地点及时间：1、因疫情影响本项目谈判响应文件采用邮寄方
    式接收文件，2、文件接收截止时间：2021年11月5日14：30(北京时间)，3、谈判响应文件邮寄地址：深圳
    市罗湖区嘉宾路2018号深华商业大厦裙楼6层600A。收件人：郑工，电话：18806665013，3、谈判地点：
    线上谈判，六、谈判的相关规则按原招标文件的相应规定执行；有关谈判事宜详见招标文件第六章《公开招标失
    败后后续采购程序和投标须知》，1、采购人信息，名称：中共深圳市委军民融合发展委员会办公室，地址：深
    圳市福田区新洲路5008号，联系方式：刘先生，电话：0755-88100332，2、采购代理机构信息，名称：深
    圳市国际招标有限公司，地址：罗湖总部：深圳市罗湖区嘉宾路2018号深华商业大厦裙楼6层，深圳湾总部：深
    圳市南山区沙河西路与白石路交汇处深圳湾科技生态园9栋B4座6楼，联系方式：0755-22918634，监督举报
    电话：0755-22965602、0755-86660475，特此通知，深圳市国际招标有限公司，2021年11月1日，更多
    咨询报价请点击：http://zbcloud.net/bidbulletin/69495.htm，
    '''
    result = extract_bidway(sample_notice, title="")
    print(result)


def test_html():
    """Print the extraction result for the content of a local HTML file (empty title)."""
    # Alternative input: "C:/Users/Administrator/Desktop/3.html"
    html_path = 'd:/html/2.html'

    with open(html_path, "r", encoding='utf-8') as html_file:
        content = html_file.read()

    result = extract_bidway(content, title="")
    print(result)

def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 when both are zero)."""
    total = precision + recall
    if total == 0:
        return 0.0
    return 2 * precision * recall / total


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def get_valuate():
    """Compare the stored and the current extraction against annotated labels.

    For every document returned by the corpus query:

    * the "old" prediction is read from the stored ``extract_json`` text;
    * the "new" prediction is produced by ``extract_bidway`` and unified with
      ``bidway_dict``;
    * up to four annotation values containing "bidway" are fetched and unified
      the same way.

    A prediction counts as correct when it equals any of the document's
    labels. The two ratios and F1 are printed for both predictions and the
    per-document details are written to a CSV file.
    """
    import psycopg2

    # NOTE(review): connection settings are hard-coded; consider configuration.
    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')

    sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;" # where docid='110635873'
    # The document id is passed as a query parameter instead of being formatted
    # into the SQL text. With parameters supplied, a literal '%' is written '%%'.
    label_sql = "select value from brat_bratannotation where document_id=%s and value like '%%bidway%%' limit 4;"

    datas = []
    olds = []
    news = []
    label_old = []
    label_new = []
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        # fetchall() materialises the outer result, so the same cursor can be
        # reused for the per-document label query inside the loop.
        for docid, doctitle, ex, text in cursor.fetchall():
            # The LEFT JOIN yields NULL text for documents without a match.
            text = text if text is not None else ""

            ser = re.search(r'"bidway": "(\w{,6})"', ex)
            old = ser.group(1) if ser else ""

            pred = extract_bidway(text, title=doctitle)
            pred = pred[0]['body'] if len(pred) > 0 else ""
            new = bidway_dict.get(pred, "其他") if pred != "" else ""

            cursor.execute(label_sql, (docid,))
            # Default to "<docid>_" (no label) unless a label equals the prediction.
            lb_new = docid + "_"
            lb_old = docid + "_"
            tmp_l = []
            for label_row in cursor.fetchall():
                # The label is the last whitespace-separated token of the value.
                lb = label_row[0].split()[-1]
                # Recorded run with this mapping: new ratios 0.9642 / 0.9642.
                lb = bidway_dict.get(lb, "其他")
                # Recorded run with bidway_normalize: old 0.9287 / 0.9287,
                # new 0.9692 / 0.9692. F1 values logged for both runs
                # (0.8965, 0.8011, 0.9105) used the earlier, incorrect formula.
                # lb = bidway_normalize(lb)

                tmp_l.append(lb)
                if lb == new:
                    lb_new = docid + "_" + lb
                if lb == old:
                    lb_old = docid + "_" + lb

            olds.append(docid + "_" + old)
            news.append(docid + "_" + new)
            label_new.append(lb_new)
            label_old.append(lb_old)
            datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, '；'.join(tmp_l)))
    finally:
        conn.close()

    eq_old = len(set(olds) & set(label_old))
    eq_new = len(set(news) & set(label_new))

    acc_old = _safe_ratio(eq_old, len(set(olds)))
    recall_old = _safe_ratio(eq_old, len(set(label_old)))
    # F1 = 2PR / (P + R); the earlier expression computed P*R*(P+R)/2.
    f1_old = _f1_score(acc_old, recall_old)

    acc_new = _safe_ratio(eq_new, len(set(news)))
    recall_new = _safe_ratio(eq_new, len(set(label_new)))
    f1_new = _f1_score(acc_new, recall_new)
    print('旧准确率：%.4f, 召回率： %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
    print('新准确率：%.4f, 召回率： %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))

    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
    df['old_pos'] = df.apply(lambda x: 1 if x['pred_old'] == x['label_old'] else 0, axis=1)
    df['new_pos'] = df.apply(lambda x: 1 if x['pred_new'] == x['label_new'] else 0, axis=1)
    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)

if __name__ == "__main__":
    # Manual smoke run on the built-in sample. Call test_csv() or test_html()
    # here instead to exercise the file-based drivers.
    test_str()