Explorar o código

优化项目编号、项目名称、招标方式;修复表头姓名修复导致的表头改为非表头问题;修复招标/代理斜线分割提取问题;

lsm hai 2 anos
pai
achega
8724d4e57b

+ 161 - 15
BiddingKG/dl/bidway/re_bidway.py

@@ -200,6 +200,7 @@ import re
 #
 #     return output_list[0], text_index_list[0]
 
+normal_bidway = "公开招标|邀请招标|竞争性谈判|竞争性磋商|单一来源|框架协议|询价"
 
 bidway = '单一来源' \
          '|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
@@ -210,17 +211,17 @@ bidway = '单一来源' \
          '|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
          '|库内邀请|库内公开发包|内部邀标' \
          '|定点采购议价|定点采购' \
-         '|竞争性评审'
+         '|竞争性评审|框架协议'
 
 not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
              '|限时竞价|咨询单位|询价单'
 
-not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除|可以选择|包括|涉及|非"
 
 not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
                     "|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
                     "|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
-                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
+                    "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败|小组"
 
 bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
                  '|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
@@ -268,21 +269,64 @@ def re_standard_bidway(_str):
     bidway_list = []
     if match:
         for m in match:
-            m_dict = m.groupdict()
-            m_span = m.span()
-            keyword = ""
-            keyword_index = [m_span[0], m_span[1]]
-            for key in m_dict.keys():
-                if key == "value":
-                    keyword = m_dict.get(key)
-                else:
-                    keyword_index[0] += len(m_dict.get(key))
+            keyword = m.group('value')
+            keyword_index = list(m.span('value'))
+            behind_str = _str[m.start(): m.end()+30]
+            if len(re.findall(normal_bidway, behind_str))>1:
+                keyword = ''
+                for it in re.finditer('(?P<sign>.{1,2})(?P<bidway>'+normal_bidway+')+', behind_str): # 招标方式后面多个选择处理
+                    if '□' != it.group('sign')[-1]:
+                        keyword = it.group('bidway')
+                        keyword_index = [m.start()+it.start('bidway'), m.start()+it.end('bidway')]
+                        break
+             # m_dict = m.groupdict()
+            # m_span = m.span()
+            # keyword = ""
+            # keyword_index = [m_span[0], m_span[1]]
+            # for key in m_dict.keys():
+            #     if key == "value":
+            #         keyword = m_dict.get(key)
+            #     else:
+            #         keyword_index[0] += len(m_dict.get(key))
             bidway_list.append([keyword, keyword_index])
 
     return bidway_list
 
+def re_normal_bidway(_str):
+    ser = re.search("("+normal_bidway+")(转为?|变更为|更改为)"+"(?P<bidway>(" + normal_bidway + "))", _str) # 如果方式变更取变更后的
+    if ser:
+        return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    reg_all = "(?P<value>" + normal_bidway + ")"
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    bidway_set = set()
+    if match:
+        for m in match:
+            keyword = m.group()
+            if keyword == '公开招标' and m.start()>0 and _str[m.start()-1]=='非':
+                continue
+            keyword_index = list(m.span())
+            bidway_set.add(keyword)
+            bidway_list.append([keyword, keyword_index])
+    if len(bidway_list) == 0: # 如果找不到标准方式,匹配简称方式
+        ser = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
+        if ser:
+            return [[ser.group('bidway'), list(ser.span('bidway'))]]
+    if len(bidway_set) > 1: # 匹配到多种招标方式返回空
+        return []
+    return bidway_list
 
 def re_all_bidway(_str):
+    reg_all = "(?P<value>" + normal_bidway + ")" # 优先匹配规范的招标方式
+    match = re.finditer(reg_all, _str)
+    bidway_list = []
+    if match:
+        for m in match:
+            keyword = m.group()
+            keyword_index = list(m.span())
+            bidway_list.append([keyword, keyword_index])
+    return bidway_list
+
     reg_all = "(?P<value>" + bidway + ")"
     match = re.finditer(reg_all, _str)
     bidway_list = []
@@ -339,6 +383,13 @@ def get_one_word(bidway_list):
 
 
 def re_bidway(text, title):
+    # 优先匹配标题标准招标方式
+    if len(title)<100:
+        bidway_list = re_normal_bidway(title)
+        if bidway_list:
+            word, text_index = get_one_word(bidway_list)
+            return word, text_index
+
     # 替换易混淆词
     text_clean = re_not_bidway(text)
     title_clean = re_not_bidway(title)
@@ -406,12 +457,30 @@ bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
                '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
                '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
                '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
-               '网上询价': '询价'}
+               '网上询价': '询价', '框架协议': '框架协议', '谈判':'竞争性谈判'}
 # Unify bidway names into their canonical form
 def bidway_integrate(bidway):
     """Return the canonical name for ``bidway`` from ``bidway_dict``, or "其他" if it is not listed."""
     return bidway_dict.get(bidway, "其他")
 
+def bidway_normalize(key):
+    if re.search('公开招标|公开发包', key):
+        return '公开招标'
+    elif re.search('单一来源', key):
+        return '单一来源'
+    elif re.search('磋商', key):
+        return '竞争性磋商'
+    elif re.search('谈判', key):
+        return '竞争性谈判'
+    elif re.search('竞谈|竞价|竞投|竞标', key):
+        return '竞价'
+    elif re.search('询价|询比|比价|询单', key):
+        return '询价'
+    elif re.search('邀请|邀标', key):
+        return '邀请招标'
+    else:
+        return bidway_dict.get(key, '其他')
+
 def test_csv():
     df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
 
@@ -456,13 +525,90 @@ def test_str():
 
 
 def test_html():
-    html_path = "C:/Users/Administrator/Desktop/3.html"
+    # html_path = "C:/Users/Administrator/Desktop/3.html"
+    html_path = 'd:/html/2.html'
 
-    with open(html_path, "r") as f:
+    with open(html_path, "r", encoding='utf-8') as f:
         s = f.read()
 
     print(extract_bidway(s, title=""))
 
+def get_valuate():
+    import psycopg2
+    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')
+    cursor = conn.cursor()
+    sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;" # where docid='110635873'
+    # sql = "select c1.docid, c1.doctitle from corpus_otherinput c1;"
+    # sql = "select text from corpus_iedocument limit 50000;"
+    cursor.execute(sql)
+    datas = []
+    olds = []
+    news = []
+    label_old = []
+    label_new = []
+    labels = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        doctitle = row[1]
+        ex = row[2]
+        text = row[3]
+        ser = re.search('"bidway": "(\w{,6})"', ex)
+        # print('ser:', ser)
+        old = ser.group(1) if ser else ""
+        pred = extract_bidway(text, title=doctitle)
+
+        # list_bidway = extract_bidway(text, title=doctitle)
+        # print('list_bidway', list_bidway)
+        # if list_bidway:
+        #     bidway = list_bidway[0].get("body")
+        #     # bidway名称统一规范
+        #     bidway = bidway_integrate(bidway)
+        # else:
+        #     bidway = ""
+        # print('bidway: ', bidway)
+
+        pred = pred[0]['body'] if len(pred) > 0 else ""
+        new = bidway_dict.get(pred, "其他") if pred!="" else ""
+        sql2 = "select value from brat_bratannotation where document_id='{0}' and value like '%bidway%' limit 4;".format(docid)
+        cursor.execute(sql2)
+        lb_new = docid + "_"
+        lb_old = docid + "_"
+        tmp_l = []
+        for row in cursor.fetchall():
+            lb = row[0].split()[-1]
+            lb = bidway_dict.get(lb, "其他")  # 新准确率:0.9642, 召回率: 0.9642, F1: 0.8965
+            # lb = bidway_normalize(lb)   # 旧准确率:0.9287, 召回率: 0.9287, F1: 0.8011  新准确率:0.9692, 召回率: 0.9692, F1: 0.9105
+
+            tmp_l.append(lb)
+            if lb == new:
+                lb_new = docid + "_" + lb
+            if lb == old:
+                lb_old = docid + "_" + lb
+        olds.append(docid + "_" + old)
+        news.append(docid + "_" + new)
+        label_new.append(lb_new)
+        label_old.append(lb_old)
+        labels.append(';'.join(tmp_l))
+        datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, ';'.join(tmp_l)))
+
+    eq_old = len(set(olds)&set(label_old))
+    eq_new = len(set(news)&set(label_new))
+
+    acc_old = eq_old/len(set(olds))
+    recall_old = eq_old/len(set(label_old))
+    f1_old = acc_old*recall_old/2*(acc_old+recall_old)
+
+    acc_new = eq_new/len(set(news))
+    recall_new = eq_new/len(set(label_new))
+    f1_new = acc_new*recall_new/2*(acc_new+recall_new)
+    print('旧准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
+    print('新准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))
+
+
+    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
+    df['old_pos'] = df.apply(lambda x:1 if x['pred_old']==x['label_old'] else 0, axis=1)
+    df['new_pos'] = df.apply(lambda x:1 if x['pred_new']==x['label_new'] else 0, axis=1)
+    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)
 
 if __name__ == "__main__":
     # extract_bidway(s)

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 0 - 2
BiddingKG/dl/interface/Preprocessing.py


+ 42 - 12
BiddingKG/dl/interface/predictor.py

@@ -277,8 +277,8 @@ class CodeNamePredict():
     
     def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
         #@summary: 获取每篇文章的code和name
-        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
-
+        # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
+        pattern_score = re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院')
         result = []
         index_unk = self.word2index.get("<unk>")
         # index_pad = self.word2index.get("<pad>")
@@ -393,20 +393,40 @@ class CodeNamePredict():
 
                                 #add code to entitys
                                 list_entity.append(temp_entitys[h])
-
-                                if the_code not in code_set:
+                                if re.search(',|/|;|、|,', the_code) and len(the_code)>25:
+                                    for it in re.split(',|/|;|、|,', the_code):
+                                        if len(it) > 8:
+                                            if it not in code_set:
+                                                code_set.add(it)
+                                                item['code'].append(it)
+                                        elif len(item['code']) > 0:
+                                            new_it = item['code'][-1] + re.search(',|/|;|、|,', the_code).group(0) + it
+                                            if new_it not in code_set:
+                                                code_set.add(new_it)
+                                                item['code'][-1] = new_it
+                                        else:
+                                            if the_code not in code_set:
+                                                code_set.add(the_code)
+                                                item['code'].append(the_code)
+                                            break
+                                elif the_code not in code_set:
                                     code_set.add(the_code)
-                                    item['code'] = list(code_set)
+                                    item['code'].append(the_code)
+
+                                # if the_code not in code_set:
+                                #     code_set.add(the_code)
+                                #     item['code'] = list(code_set)
                     for iter in re.finditer(self.PN_pattern,join_predict):
                         _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
 
                         #add name to entitys
                         _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                         list_entity.append(_entity)
-                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
+                        # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
+                        w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                         if _name not in dict_name_freq_score:
                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w+(5-sentence.sentence_index)*0.2]
                         else:
                             dict_name_freq_score[_name][0] += 1
                     '''
@@ -423,18 +443,21 @@ class CodeNamePredict():
                 _begin_index += _LEN
             
             list_name_freq_score = []
+            # print('模型预测项目名称:', dict_name_freq_score)
 
             # 2020/11/23 大网站规则调整
             if len(dict_name_freq_score) == 0:
-                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
+                # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
+                name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
                 for sentence in list_sentence:
                     # pad_sentence = sentence.sentence_text
                     othername = re.search(name_re1, sentence.sentence_text)
                     if othername != None:
-                        project_name = othername.group(3)
+                        project_name = othername.group('name')
                         beg = find_index([project_name], sentence.sentence_text)[0]
                         end = beg + len(project_name)
                         _name = self.fitDataByRule(sentence.sentence_text[beg:end])
+                        # print('规则召回项目名称:', _name)
                         # add name to entitys
                         _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                         sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
@@ -444,7 +467,7 @@ class CodeNamePredict():
                         w = 1
                         if _name not in dict_name_freq_score:
                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
                         else:
                             dict_name_freq_score[_name][0] += 1
                 # othername = re.search(name_re1, sentence.sentence_text)
@@ -461,6 +484,8 @@ class CodeNamePredict():
             if len(list_name_freq_score)>0:
                 list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
                 item['name'] = list_name_freq_score[0][0]
+                # for it in list_name_freq_score:
+                    # print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
                 # if list_name_freq_score[0][1][0]>1:
                 #     item[1]['name'] = list_name_freq_score[0][0]
                 # else:
@@ -474,9 +499,10 @@ class CodeNamePredict():
                     # if othercode != None:
                     #     item[1]['code'].append(othercode.group(2))
                     # 2020/11/23 大网站规则调整
-                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
+                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
                     if othercode != None:
-                        item['code'].append(othercode.group(3))
+                        item['code'].append(othercode.group('code'))
+                        # print('规则召回项目编号:', othercode.group('code'))
             item['code'] = [code for code in item['code'] if len(code)<500]
             item['code'].sort(key=lambda x:len(x),reverse=True)
             result.append(item)
@@ -692,6 +718,7 @@ class PREMPredict():
             text_tup = text_list[i]
             front, middle, behind = text_tup
             whole = "".join(text_tup)
+            # print('模型预测角色:', front, entity.entity_text, label, values)
             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
                 label = 5
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
@@ -770,6 +797,7 @@ class PREMPredict():
             whole = "".join(text_tup)
             # print('金额: ', entity.entity_text, label, values, front, middle, behind)
             if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
+                # print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
                 label = 2
             elif label == 1: # 错误中标金额处理
                 if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
@@ -1435,6 +1463,7 @@ class RoleRulePredictor():
                                         self.pattern_money_other, _span[0]) is None:
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.label = 0
+                                    # print('规则召回预算金额:', p_entity.entity_text, _span[0])
                                 if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                     if re.search(self.pattern_money_other, _span[0]) is not None:
                                         if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
@@ -1453,6 +1482,7 @@ class RoleRulePredictor():
                                         , _sentence.sentence_text[:p_entity.wordOffset_begin]): # 处理几个标段金额相邻情况 例子:191705231
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.label = 0
+                                    # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
 
             # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
             list_p = []

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio