123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- import pandas as pd
- import psycopg2
- import pickle
- import re
def get_data():
    """Fetch the person entity mentions to be labeled and cache them on disk.

    Runs a fixed query against the ``BidiPro`` PostgreSQL database that joins
    ``entity_mention`` with ``sentences`` for ``person`` entities belonging to
    the 4000 highest ``articles_processed`` ids (at most 20000 rows), reads
    the result in batches of 5000 rows and pickles the combined DataFrame to
    ``data/person.pk``.

    The cursor and connection are always closed, even when the query fails.
    An empty result is pickled as an empty DataFrame that still carries the
    expected column names.
    """
    columns = ['doc_id', 'entity_id', 'sentence_index', 'entity_text',
               'entity_type', 'begin_index', 'end_index', 'tokens']
    batch_size = 5000
    sql = '''SELECT e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type,e.begin_index,e.end_index,s.tokens from entity_mention e,sentences s
    WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND entity_type in ('person')
    and e.doc_id in (select id from articles_processed order by id desc limit 4000) ORDER BY doc_id,sentence_index,begin_index limit 20000
    '''

    conn = psycopg2.connect(dbname='BidiPro', user='postgres', password='postgres', host='192.168.2.101')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            # Collect the batches and concatenate once afterwards; growing a
            # DataFrame inside the loop copies all previous rows every time.
            frames = []
            rows = cursor.fetchmany(batch_size)
            while rows:
                frames.append(pd.DataFrame(rows, columns=columns))
                rows = cursor.fetchmany(batch_size)
        finally:
            cursor.close()
    finally:
        conn.close()

    if frames:
        new_df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep the schema for downstream code.
        new_df = pd.DataFrame(columns=columns)

    with open('data/person.pk', 'wb') as f:
        pickle.dump(new_df, f)
-
def _first_matching_label(context, rules, default=0):
    """Return the label of the first rule whose pattern occurs in *context*."""
    for pattern, rule_label in rules:
        if pattern.search(context):
            return rule_label
    return default


def label_data():
    """Pre-label person entities with keyword rules before manual labeling.

    Loads the DataFrame pickled at ``data/person.pk``. For every row whose
    ``entity_type`` is ``'person'`` it takes up to ``span`` tokens before and
    after the entity. The preceding tokens are joined, stripped of commas and
    whitespace, and tested against the keyword rules in order; the first rule
    that matches decides the label and rows matching no rule get label 0.

    Labels produced: 0 = unknown (also assigned to supervision and
    construction-staff contexts), 1 = tendering party, 2 = agency,
    3 = contact / winning-bidder context.

    The result is written to ``data/person_label.xls`` and pickled to
    ``data/person_label.pk``.
    """
    file2 = 'data/person_label.xls'
    span = 10  # number of context tokens kept on each side of the entity

    # Order matters: an earlier rule wins over a later one.
    rules = (
        # supervision / evaluation context
        (re.compile(r'评标|评审|审批|审查|评委|监标|专家|小组|成员|名单|监督|监管|监察|监审|主管|受理|处室|反映|异议|质疑|(\d{2}\.\d{2})[^\d]'), 0),
        # tendering party
        (re.compile('采购中心|采购单位|采购人|采购经办人|招标单位|建设单位|招标人|项目单位|比选人|发包人|项目业主'), 1),
        # agency
        (re.compile('代理机构|招标代理|采购代理|采购机构|招标代理机构|招标代理人'), 2),
        # construction staff
        (re.compile('甲方代表|管理人员|管理机构人员|施工员|安全员|质检员|质量员|材料员|预算员|建造师|造价员|监理员|监理人员|项目总监'), 0),
        # contact / winning bidder; the character class no longer contains
        # '|' separators, which made it match a literal '|' as well
        (re.compile('经办人|联系人|联系方式|联系电话|法人代表|法定代表|中标供应商|中标人|中标单位|中标候选人|第[一二三123]名|中标项目|项目负责人|项目经理'), 3),
    )
    noise = re.compile(r',|\s')

    with open('data/person.pk', 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        data = pickle.load(f)

    pre_ent = []   # tokens preceding each entity
    cur_ent = []   # tokens following each entity
    label = []     # rule-based label per entity
    ent_idl = []
    shiti = []     # entity text
    s_list = []
    b_list = []

    # Iterate the columns in lockstep (positional), so the loop does not
    # depend on the DataFrame having a 0..n-1 index.
    rows = zip(data['tokens'], data['begin_index'], data['end_index'],
               data['entity_id'], data['entity_text'], data['entity_type'],
               data['sentence_index'])
    for toks, begin, end, ent_id, ent_text, ent_type, sen_index in rows:
        if ent_type != 'person':
            continue

        # Clamp the context window to the sentence boundaries.
        window_start = max(begin - span, 0)
        window_end = min(end + span, len(toks))
        before = toks[window_start:begin]
        after = toks[end:window_end]

        pre_ent.append(before)
        cur_ent.append(after)
        ent_idl.append(ent_id)
        shiti.append(ent_text)
        s_list.append(sen_index)
        b_list.append(begin)

        context = noise.sub('', ''.join(before))
        label.append(_first_matching_label(context, rules))

    new_data = {'pre_ent': pre_ent, 'label': label, 'cur_ent': cur_ent, 'entity_id': ent_idl,
                'shiti': shiti, 'sentence_index': s_list, 'begin_index': b_list}
    data_label = pd.DataFrame(new_data)
    # The former encoding='utf-8' argument is dropped: pandas documents it as
    # unused and newer releases reject it.
    # NOTE(review): writing .xls needs an engine that recent pandas releases
    # may no longer ship; the path is kept because post_data() reads it.
    data_label.to_excel(file2, index=False,
                        columns=['entity_id', 'sentence_index', 'begin_index', 'pre_ent', 'label', 'cur_ent', 'shiti'])
    with open('data/person_label.pk', 'wb') as f:
        pickle.dump(data_label, f)
def post_data():
    """Push the labeled data to the database.

    Reads ``data/person_label.xls`` and replaces the contents of the
    ``person_label`` table (created as ``(entity_id text, label int)`` when it
    does not exist yet) with one row per spreadsheet row.

    The spreadsheet is read and converted before the database is touched, the
    rows are inserted with bound parameters instead of string-built SQL, and
    the cursor and connection are closed even when a statement fails (nothing
    is committed in that case).

    Raises:
        ValueError: if a ``label`` cell cannot be converted to ``int``.
    """
    table = 'person_label'  # fixed identifier, never taken from input

    df3 = pd.read_excel('data/person_label.xls', header=0)
    records = [
        (str(entity_id), int(label))
        for entity_id, label in zip(df3['entity_id'], df3['label'])
    ]

    conn = psycopg2.connect(dbname='BidiPro', user='postgres', password='postgres', host='192.168.2.101')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(" select to_regclass(%s) is null ", (table,))
            not_exists = cursor.fetchone()[0]
            if not_exists:
                cursor.execute(" create table " + table + " (entity_id text,label int)")
            else:
                cursor.execute(" delete from " + table)

            # Bound parameters let the driver quote the values, so an
            # entity_id containing a quote can no longer break the statement.
            cursor.executemany(
                " insert into " + table + "(entity_id,label) values(%s,%s)",
                records,
            )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
if __name__ == '__main__':
    # Script entry point. The fetch step is disabled, so label_data() works
    # from whatever data/person.pk already exists on disk.
    # get_data()
    for step in (label_data, post_data):
        step()
|