"""Rule-based pre-labelling pipeline for 'person' entity mentions.

Three steps, each reading the previous step's artefact from ``data/``:

1. ``get_data``   - query person mentions from PostgreSQL and pickle them.
2. ``label_data`` - assign a rule-based label to every mention and export the
   result to Excel (for manual review) and to a pickle.
3. ``post_data``  - load the Excel sheet into the ``person_label`` table.
"""
import pickle
import re

import pandas as pd
import psycopg2

# NOTE(review): connection settings (including the password) are hard-coded;
# consider moving them to configuration or environment variables.
_DB_PARAMS = {
    'dbname': 'BidiPro',
    'user': 'postgres',
    'password': 'postgres',
    'host': '192.168.2.101',
}

# Number of rows pulled from the cursor per round trip.
_FETCH_SIZE = 5000

_ENTITY_COLUMNS = ['doc_id', 'entity_id', 'sentence_index', 'entity_text',
                   'entity_type', 'begin_index', 'end_index', 'tokens']

# Person mentions (joined with their sentence tokens) from the 4000 articles
# with the highest ids, capped at 20000 rows.
_PERSON_MENTION_SQL = '''SELECT e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type,e.begin_index,e.end_index,s.tokens from entity_mention e,sentences s
WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND entity_type in ('person') and e.doc_id in (select id from articles_processed order by id desc limit 4000)
ORDER BY doc_id,sentence_index,begin_index limit 20000
'''

_RAW_PICKLE = 'data/person.pk'
_LABEL_EXCEL = 'data/person_label.xls'
_LABEL_PICKLE = 'data/person_label.pk'
_LABEL_TABLE = 'person_label'

# Number of tokens kept on each side of a mention as its context window.
_CONTEXT_SPAN = 10

# Keyword patterns searched in the text that precedes a mention.
_ZHAOBIAO = re.compile('采购中心|采购单位|采购人|采购经办人|招标单位|建设单位|招标人|项目单位|比选人|发包人|项目业主')
_DAILI = re.compile('代理机构|招标代理|采购代理|采购机构|招标代理机构|招标代理人')
_JIANDU = re.compile(r'评标|评审|审批|审查|评委|监标|专家|小组|成员|名单|监督|监管|监察|监审|主管|受理|处室|反映|异议|质疑|(\d{2}\.\d{2})[^\d]')
_SHIGONG = re.compile('甲方代表|管理人员|管理机构人员|施工员|安全员|质检员|质量员|材料员|预算员|建造师|造价员|监理员|监理人员|项目总监')
# The rank alternative uses a plain character class; a '|' inside [...] would
# be matched literally rather than acting as alternation.
_LIANXI = re.compile('经办人|联系人|联系方式|联系电话|法人代表|法定代表|中标供应商|中标人|中标单位|中标候选人|第[一二三123]名|中标项目|项目负责人|项目经理')

# Rules are tried in this order and the first hit wins, so the patterns mapped
# to 0 (supervision / construction staff) deliberately shadow later rules.
# Labels produced: 0 = unknown/other, 1 = tenderee, 2 = agency,
# 3 = contact / winning-bidder keywords.
_LABEL_RULES = (
    (_JIANDU, 0),
    (_ZHAOBIAO, 1),
    (_DAILI, 2),
    (_SHIGONG, 0),
    (_LIANXI, 3),
)

# Commas and whitespace are removed from the context before keyword matching.
_SEPARATORS = re.compile(r',|\s')


def _rule_label(context):
    """Return the label of the first rule matching *context*, or 0 if none does."""
    for pattern, label in _LABEL_RULES:
        if pattern.search(context):
            return label
    return 0


def get_data():
    """Fetch the person mentions to be labelled and pickle them.

    Runs ``_PERSON_MENTION_SQL``, reads the result in batches of
    ``_FETCH_SIZE`` rows and writes the combined DataFrame (columns
    ``_ENTITY_COLUMNS``, fresh 0..n-1 index) to ``data/person.pk``.
    An empty result set yields an empty DataFrame.
    """
    batches = []
    conn = psycopg2.connect(**_DB_PARAMS)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(_PERSON_MENTION_SQL)
            while True:
                rows = cursor.fetchmany(_FETCH_SIZE)
                if not rows:
                    break
                batches.append(pd.DataFrame(rows, columns=_ENTITY_COLUMNS))
        finally:
            cursor.close()
    finally:
        # Release the connection even when the query or a fetch fails.
        conn.close()

    # Concatenate once instead of growing the frame inside the fetch loop.
    if batches:
        new_df = pd.concat(batches, ignore_index=True)
    else:
        new_df = pd.DataFrame()

    with open(_RAW_PICKLE, 'wb') as f:
        pickle.dump(new_df, f)


def label_data():
    """Pre-label every person mention with keyword rules.

    Reads ``data/person.pk``. For each row whose ``entity_type`` is
    ``'person'`` it takes up to ``_CONTEXT_SPAN`` tokens before and after the
    mention, joins the preceding tokens into one string (commas and whitespace
    removed) and labels it with ``_rule_label``.

    Writes the result to ``data/person_label.xls`` and
    ``data/person_label.pk``.
    """
    # The pickle is expected to be the file written by get_data(); pickle must
    # never be used to load data from an untrusted source.
    with open(_RAW_PICKLE, 'rb') as f:
        data = pickle.load(f)

    pre_ent = []   # tokens preceding each mention
    cur_ent = []   # tokens following each mention
    label = []     # rule-based label per mention
    ent_idl = []
    shiti = []     # mention text
    s_list = []
    b_list = []

    rows = zip(data['tokens'], data['begin_index'], data['end_index'],
               data['entity_id'], data['entity_text'], data['entity_type'],
               data['sentence_index'])
    for tokens, begin, end, ent_id, ent_text, ent_type, sen_index in rows:
        if ent_type != 'person':
            continue

        # Clamp the context window to the sentence boundaries.
        window_start = max(begin - _CONTEXT_SPAN, 0)
        window_end = min(end + _CONTEXT_SPAN, len(tokens))
        before = tokens[window_start:begin]
        after = tokens[end:window_end]

        pre_ent.append(before)
        cur_ent.append(after)
        ent_idl.append(ent_id)
        shiti.append(ent_text)
        s_list.append(sen_index)
        b_list.append(begin)

        # Only the text before the mention takes part in the rule matching.
        context = _SEPARATORS.sub('', ''.join(before))
        label.append(_rule_label(context))

    new_data = {'pre_ent': pre_ent, 'label': label, 'cur_ent': cur_ent,
                'entity_id': ent_idl, 'shiti': shiti,
                'sentence_index': s_list, 'begin_index': b_list}
    data_label = pd.DataFrame(new_data)
    # NOTE(review): the `encoding` keyword and writing the legacy .xls format
    # appear to be supported only by older pandas releases - confirm the
    # pinned pandas version before upgrading.
    data_label.to_excel(_LABEL_EXCEL, encoding='utf-8', index=False,
                        columns=['entity_id', 'sentence_index', 'begin_index',
                                 'pre_ent', 'label', 'cur_ent', 'shiti'])
    with open(_LABEL_PICKLE, 'wb') as f:
        pickle.dump(data_label, f)


def post_data():
    """Push the labelled data from ``data/person_label.xls`` to the database.

    Creates the ``person_label`` table when it does not exist, otherwise
    deletes its current rows, then inserts one ``(entity_id, label)`` row per
    spreadsheet row. Everything is committed in a single transaction, so a
    failure part-way leaves the table unchanged.

    Raises:
        ValueError: if a label cell cannot be converted to ``int``.
    """
    df3 = pd.read_excel(_LABEL_EXCEL, header=0)
    # Convert to plain Python types up front so the values can be passed as
    # query parameters.
    records = [(str(entity_id), int(label))
               for entity_id, label in zip(df3['entity_id'], df3['label'])]

    conn = psycopg2.connect(**_DB_PARAMS)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(" select to_regclass(%s) is null ", (_LABEL_TABLE,))
            not_exists = cursor.fetchall()[0][0]
            # The table name is a module constant, never user input, so it is
            # safe to splice it into the statement text.
            if not_exists:
                cursor.execute(" create table " + _LABEL_TABLE
                               + " (entity_id text,label int)")
            else:
                cursor.execute(" delete from " + _LABEL_TABLE)

            # Values come from a spreadsheet: always pass them as parameters
            # instead of concatenating them into the SQL text.
            cursor.executemany(
                " insert into " + _LABEL_TABLE
                + "(entity_id,label) values(%s,%s)",
                records)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Closing without a commit discards any partially applied changes.
        conn.close()


if __name__ == '__main__':
    # get_data() is disabled here; the run reuses an existing data/person.pk.
    #get_data()
    label_data()
    post_data()