|
@@ -200,6 +200,7 @@ import re
|
|
|
#
|
|
|
# return output_list[0], text_index_list[0]
|
|
|
|
|
|
+normal_bidway = "公开招标|邀请招标|竞争性谈判|竞争性磋商|单一来源|框架协议|询价"
|
|
|
|
|
|
bidway = '单一来源' \
|
|
|
'|国内竞争性磋商|竞争性磋商|竞争性谈判|网络竞价|网上竞价|公开竞谈|公开竞价|电子竞价|竞价|竞标|竞谈竞价|电子书面竞投' \
|
|
@@ -210,17 +211,17 @@ bidway = '单一来源' \
|
|
|
'|网上询价|公开询价|非定向询价|定向询价|询比价|询单|询价|询比' \
|
|
|
'|库内邀请|库内公开发包|内部邀标' \
|
|
|
'|定点采购议价|定点采购' \
|
|
|
- '|竞争性评审'
|
|
|
+ '|竞争性评审|框架协议'
|
|
|
|
|
|
not_bidway = '及单一来源|询价小组成员|除单一来源|竞争性谈判邀请函|询价记录|自由竞价' \
|
|
|
'|限时竞价|咨询单位|询价单'
|
|
|
|
|
|
-not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除"
|
|
|
+not_bidway_preffix = "本次|拟|参加|无效|标的|联合体|参与|否决|除|可以选择|包括|涉及|非"
|
|
|
|
|
|
not_bidway_suffix = "文件|报名|邀请|项目|失败|数量|编号|后|时间|类型|名称|和|成交" \
|
|
|
"|标题|开始|结束|产品|报价|供应商|部门|监督|需求|范围|入围|内容|人" \
|
|
|
"|条件|公司|保证金|完毕|事件|成功|活动|地点|标|会|须知|范围" \
|
|
|
- "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败"
|
|
|
+ "|响应|报价|采购公示|的原因|采购供应商|价|采购人员|失败|小组"
|
|
|
|
|
|
bidway_preffix = '采购方式|竞价方式|招标方式|询价类型|交易方式|寻源策略|招标形式|询价方式' \
|
|
|
'|发包方式|发包类型|开展方式|招标类型|选取方式|招租方式'
|
|
@@ -268,21 +269,64 @@ def re_standard_bidway(_str):
|
|
|
bidway_list = []
|
|
|
if match:
|
|
|
for m in match:
|
|
|
- m_dict = m.groupdict()
|
|
|
- m_span = m.span()
|
|
|
- keyword = ""
|
|
|
- keyword_index = [m_span[0], m_span[1]]
|
|
|
- for key in m_dict.keys():
|
|
|
- if key == "value":
|
|
|
- keyword = m_dict.get(key)
|
|
|
- else:
|
|
|
- keyword_index[0] += len(m_dict.get(key))
|
|
|
+ keyword = m.group('value')
|
|
|
+ keyword_index = list(m.span('value'))
|
|
|
+ behind_str = _str[m.start(): m.end()+30]
|
|
|
+ if len(re.findall(normal_bidway, behind_str))>1:
|
|
|
+ keyword = ''
|
|
|
+ for it in re.finditer('(?P<sign>.{1,2})(?P<bidway>'+normal_bidway+')+', behind_str): # 招标方式后面多个选择处理
|
|
|
+ if '□' != it.group('sign')[-1]:
|
|
|
+ keyword = it.group('bidway')
|
|
|
+ keyword_index = [m.start()+it.start('bidway'), m.start()+it.end('bidway')]
|
|
|
+ break
|
|
|
+ # m_dict = m.groupdict()
|
|
|
+ # m_span = m.span()
|
|
|
+ # keyword = ""
|
|
|
+ # keyword_index = [m_span[0], m_span[1]]
|
|
|
+ # for key in m_dict.keys():
|
|
|
+ # if key == "value":
|
|
|
+ # keyword = m_dict.get(key)
|
|
|
+ # else:
|
|
|
+ # keyword_index[0] += len(m_dict.get(key))
|
|
|
bidway_list.append([keyword, keyword_index])
|
|
|
|
|
|
return bidway_list
|
|
|
|
|
|
def re_normal_bidway(_str):
    """Find standard bidway names in ``_str`` (used on document titles).

    Returns a list of ``[keyword, [start, end]]`` entries:

    * if the text says one standard bidway was changed into another
      ("A转为B", "A变更为B", "A更改为B"), only the bidway after the change
      is returned;
    * otherwise every standard bidway occurrence is collected, except
      '公开招标' directly preceded by '非';
    * if nothing standard is found, the short forms '磋商' / '谈判' directly
      followed by 公告/成交/结果 are tried;
    * if more than one distinct standard bidway is present, an empty list
      is returned.
    """
    # A change of procurement method wins: keep the method after the change.
    changed_pattern = ("(" + normal_bidway + ")(转为?|变更为|更改为)"
                       + "(?P<bidway>(" + normal_bidway + "))")
    changed = re.search(changed_pattern, _str)
    if changed is not None:
        return [[changed.group('bidway'), list(changed.span('bidway'))]]

    hits = []
    distinct_kinds = set()
    for found in re.finditer("(?P<value>" + normal_bidway + ")", _str):
        word = found.group()
        begin = found.start()
        # "非公开招标" is a negation, not a public tender.
        if word == '公开招标' and begin > 0 and _str[begin - 1] == '非':
            continue
        distinct_kinds.add(word)
        hits.append([word, list(found.span())])

    if not hits:
        # No standard name found: fall back to the abbreviated forms.
        short = re.search('(?P<bidway>(磋商|谈判))(公告|成交|结果)', _str)
        if short is not None:
            return [[short.group('bidway'), list(short.span('bidway'))]]

    # Several different bidways in one text are treated as no answer.
    if len(distinct_kinds) > 1:
        return []
    return hits
|
|
|
|
|
|
def re_all_bidway(_str):
|
|
|
+ reg_all = "(?P<value>" + normal_bidway + ")" # 优先匹配规范的招标方式
|
|
|
+ match = re.finditer(reg_all, _str)
|
|
|
+ bidway_list = []
|
|
|
+ if match:
|
|
|
+ for m in match:
|
|
|
+ keyword = m.group()
|
|
|
+ keyword_index = list(m.span())
|
|
|
+ bidway_list.append([keyword, keyword_index])
|
|
|
+ return bidway_list
|
|
|
+
|
|
|
reg_all = "(?P<value>" + bidway + ")"
|
|
|
match = re.finditer(reg_all, _str)
|
|
|
bidway_list = []
|
|
@@ -339,6 +383,13 @@ def get_one_word(bidway_list):
|
|
|
|
|
|
|
|
|
def re_bidway(text, title):
|
|
|
+ # 优先匹配标题标准招标方式
|
|
|
+ if len(title)<100:
|
|
|
+ bidway_list = re_normal_bidway(title)
|
|
|
+ if bidway_list:
|
|
|
+ word, text_index = get_one_word(bidway_list)
|
|
|
+ return word, text_index
|
|
|
+
|
|
|
# 替换易混淆词
|
|
|
text_clean = re_not_bidway(text)
|
|
|
title_clean = re_not_bidway(title)
|
|
@@ -406,12 +457,30 @@ bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
|
|
|
'网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
|
|
|
'竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
|
|
|
'公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
|
|
|
- '网上询价': '询价'}
|
|
|
+ '网上询价': '询价', '框架协议': '框架协议', '谈判':'竞争性谈判'}
|
|
|
# Unify bidway names into their canonical form
def bidway_integrate(bidway):
    """Return the canonical name for ``bidway`` from ``bidway_dict``.

    Names that are not keys of ``bidway_dict`` map to '其他'.
    """
    return bidway_dict.get(bidway, "其他")
|
|
|
|
|
|
def bidway_normalize(key):
    """Map a raw bidway string onto a canonical bidway name by keyword.

    The keyword groups below are tried in order and the first group that
    occurs anywhere in ``key`` decides the result. When no group matches,
    ``key`` is looked up in ``bidway_dict`` and defaults to '其他'.
    """
    # Order matters: earlier rules take precedence over later ones.
    ordered_rules = (
        ('公开招标|公开发包', '公开招标'),
        ('单一来源', '单一来源'),
        ('磋商', '竞争性磋商'),
        ('谈判', '竞争性谈判'),
        ('竞谈|竞价|竞投|竞标', '竞价'),
        ('询价|询比|比价|询单', '询价'),
        ('邀请|邀标', '邀请招标'),
    )
    for pattern, canonical in ordered_rules:
        if re.search(pattern, key) is not None:
            return canonical
    return bidway_dict.get(key, '其他')
|
|
|
+
|
|
|
def test_csv():
|
|
|
df = pd.read_csv("C:\\Users\\Administrator\\Desktop\\bidway_text.csv")
|
|
|
|
|
@@ -456,13 +525,90 @@ def test_str():
|
|
|
|
|
|
|
|
|
def test_html():
    """Read a local HTML sample and print the bidway extracted from it."""
    # Previous sample location: "C:/Users/Administrator/Desktop/3.html"
    sample_path = 'd:/html/2.html'

    with open(sample_path, "r", encoding='utf-8') as handle:
        html = handle.read()

    extracted = extract_bidway(html, title="")
    print(extracted)
|
|
|
|
|
|
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _f1_score(precision, recall):
    """Return the harmonic mean of precision and recall (0.0 when both are zero)."""
    return _safe_ratio(2 * precision * recall, precision + recall)


def get_valuate():
    """Compare the previous and the current bidway extraction with the labels.

    For every document selected from ``corpus_otherinput`` this:

    * reads the previously stored prediction out of ``extract_json``,
    * runs ``extract_bidway`` on the text/title for the current prediction,
    * loads up to four ``bidway`` annotations from ``brat_bratannotation``
      and normalises them through ``bidway_dict``,

    then prints precision / recall / F1 for the old and the new predictions
    and writes the per-document comparison to a CSV file.

    Returns:
        None. Results are printed and written to disk.
    """
    import psycopg2

    doc_sql = "select c1.docid, c1.doctitle, c1.extract_json, c2.text from corpus_otherinput c1 left join corpus_iedocument c2 on c1.docid=c2.human_identifier where c1.new_extract notnull;"  # where docid='110635873'
    # Bound parameter instead of string formatting; a literal '%' has to be
    # doubled once parameters are passed to execute().
    label_sql = "select value from brat_bratannotation where document_id=%s and value like '%%bidway%%' limit 4;"
    old_bidway_re = re.compile(r'"bidway": "(\w{,6})"')

    datas = []
    olds = []
    news = []
    label_old = []
    label_new = []

    conn = psycopg2.connect(host='192.168.2.103', port='5432', user='postgres', password='postgres', dbname='iepy')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(doc_sql)
            # fetchall() materialises the rows, so the cursor can be reused
            # for the per-document label query inside the loop.
            for docid, doctitle, extract_json, text in cursor.fetchall():
                # Old prediction: whatever was stored in the extraction JSON.
                found = old_bidway_re.search(extract_json or "")
                old = found.group(1) if found else ""

                # New prediction: first extracted entity, normalised by bidway_dict.
                pred = extract_bidway(text, title=doctitle)
                pred = pred[0]['body'] if len(pred) > 0 else ""
                new = bidway_dict.get(pred, "其他") if pred != "" else ""

                cursor.execute(label_sql, (docid,))
                lb_new = docid + "_"
                lb_old = docid + "_"
                doc_labels = []
                for (value,) in cursor.fetchall():
                    # The label is the last whitespace-separated token of the annotation.
                    lb = bidway_dict.get(value.split()[-1], "其他")
                    # Alternative normalisation that was also evaluated:
                    # lb = bidway_normalize(lb)
                    doc_labels.append(lb)
                    if lb == new:
                        lb_new = docid + "_" + lb
                    if lb == old:
                        lb_old = docid + "_" + lb

                joined_labels = ';'.join(doc_labels)
                olds.append(docid + "_" + old)
                news.append(docid + "_" + new)
                label_new.append(lb_new)
                label_old.append(lb_old)
                datas.append((docid, docid + "_" + old, lb_old, docid + "_" + new, lb_new, joined_labels))
        finally:
            cursor.close()
    finally:
        conn.close()

    eq_old = len(set(olds) & set(label_old))
    eq_new = len(set(news) & set(label_new))

    acc_old = _safe_ratio(eq_old, len(set(olds)))
    recall_old = _safe_ratio(eq_old, len(set(label_old)))
    # F1 = 2PR / (P + R). The earlier expression `P*R/2*(P+R)` evaluated to
    # P*R*(P+R)/2, which is why previously logged F1 values (e.g. 0.8965 for
    # P = R = 0.9642) were lower than both precision and recall.
    f1_old = _f1_score(acc_old, recall_old)

    acc_new = _safe_ratio(eq_new, len(set(news)))
    recall_new = _safe_ratio(eq_new, len(set(label_new)))
    f1_new = _f1_score(acc_new, recall_new)

    print('旧准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_old, recall_old, f1_old))
    print('新准确率:%.4f, 召回率: %.4f, F1: %.4f'%(acc_new, recall_new, f1_new))

    df = pd.DataFrame(datas, columns=['docid', 'pred_old', 'label_old', 'pred_new', 'label_new', 'labels'])
    df['old_pos'] = df.apply(lambda x: 1 if x['pred_old'] == x['label_old'] else 0, axis=1)
    df['new_pos'] = df.apply(lambda x: 1 if x['pred_new'] == x['label_new'] else 0, axis=1)
    df.to_csv('E:/其他数据/招标方式预测结果.csv', index=False)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
# extract_bidway(s)
|