# general_data.py — fetch, rule-based pre-label, and push back person-entity annotation data.
  1. import pandas as pd
  2. import psycopg2
  3. import pickle
  4. import re
  5. def get_data():
  6. '''
  7. @summary: 取出待标注的数据到excel中
  8. '''
  9. conn = psycopg2.connect(dbname='BidiPro', user='postgres',password='postgres',host='192.168.2.101')
  10. cursor = conn.cursor()
  11. #sql = '''SELECT e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type,e.begin_index,e.end_index,s.tokens from entity_mention e,sentences s
  12. #WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND entity_type in ('person') ORDER BY doc_id,sentence_index,begin_index LIMIT 20000
  13. #'''
  14. sql = '''SELECT e.doc_id,e.entity_id,e.sentence_index,e.entity_text,e.entity_type,e.begin_index,e.end_index,s.tokens from entity_mention e,sentences s
  15. WHERE s.doc_id=e.doc_id AND s.sentence_index=e.sentence_index AND entity_type in ('person')
  16. and e.doc_id in (select id from articles_processed order by id desc limit 4000) ORDER BY doc_id,sentence_index,begin_index limit 20000
  17. '''
  18. cursor.execute(sql)
  19. rows = cursor.fetchmany(5000)
  20. new_df = pd.DataFrame()
  21. i = 0
  22. while(rows):
  23. df = pd.DataFrame(rows, columns=['doc_id','entity_id','sentence_index','entity_text','entity_type','begin_index','end_index','tokens'])
  24. i += 1
  25. new_df = pd.concat([new_df, df],ignore_index=True)
  26. #df.to_excel('data/person_'+str(i)+'.xls', encoding='utf-8',index=False)
  27. rows = cursor.fetchmany(5000)
  28. with open('data/person.pk', 'wb') as f:
  29. pickle.dump(new_df, f)
  30. #new_df.to_excel('data/person_total.xls', encoding='utf-8', index=False)
  31. #print(rows)
  32. cursor.close()
  33. conn.close()
  34. def label_data():
  35. '''
  36. @summary: 先通过规则预标注
  37. '''
  38. file2 = 'data/person_label.xls'
  39. with open('data/person.pk', 'rb') as f:
  40. data = pickle.load(f)
  41. # 分类:未知0 招标1 代理2 中标3 监督4 施工员5 联系人6
  42. zhaobiao = re.compile('采购中心|采购单位|采购人|采购经办人|招标单位|建设单位|招标人|项目单位|比选人|发包人|项目业主')
  43. daili = re.compile('代理机构|招标代理|采购代理|采购机构|招标代理机构|招标代理人')
  44. # zhongbiao = re.compile('供应商|法人代表|法定代表|中标人|中标单位|中标候选人|第[一|二|三|1|2|3]名|中标项目|项目负责人|项目经理')
  45. jiandu = re.compile('评标|评审|审批|审查|评委|监标|专家|小组|成员|名单|监督|监管|监察|监审|主管|受理|处室|反映|异议|质疑|(\d{2}\.\d{2})[^\d]')
  46. shigong = re.compile('甲方代表|管理人员|管理机构人员|施工员|安全员|质检员|质量员|材料员|预算员|建造师|造价员|监理员|监理人员|项目总监')
  47. lianxi = re.compile('经办人|联系人|联系方式|联系电话|法人代表|法定代表|中标供应商|中标人|中标单位|中标候选人|第[一|二|三|1|2|3]名|中标项目|项目负责人|项目经理')
  48. pattern_pos = re.compile('联系方式|联系人|项目负责人|项目经理|法人|法定代表|级别及证书|采购人|第一|第二|第三|第1|第2|第3')
  49. #pattern = re.compile('采购代理|采购机构|采购人|代理机构|项目负责人|联系人|技术负责人|第一|第二|第三|中标人|中标供应商|中标机构|中标候选人|招标|代理|资质|法人代表')
  50. pattern_neg = re.compile('监管|监察|监督|主管|受理|处室|异议|反映|评委|评审|评标|监标|委员会|磋商|专家|小组|人员类别|管理人员|人员配备|成员|名单')
  51. count = 0
  52. span = 10
  53. tokens = data['tokens']
  54. ben = data['begin_index']
  55. end = data['end_index']
  56. ent_id = data['entity_id']
  57. ent = data['entity_text']
  58. ent_type = data['entity_type']
  59. sen_index = data['sentence_index']
  60. pre_ent = []
  61. cur_ent = []
  62. label = [] # 标签列表
  63. ent_idl = []
  64. shiti = []
  65. s_list = []
  66. b_list = []
  67. for i in range(len(tokens)):
  68. if ent_type[i] == 'person':
  69. begin1 = ben[i] - span if ben[i] > span else 0
  70. end1 = end[i] + span if end[i] + span < len(tokens[i]) else len(tokens[i])
  71. pre_ent.append(tokens[i][begin1:ben[i]])
  72. cur_ent.append(tokens[i][end[i]:end1])
  73. ent_idl.append(ent_id[i])
  74. shiti.append(ent[i])
  75. s_list.append(sen_index[i])
  76. b_list.append(ben[i])
  77. str_tok = ''.join(tokens[i][begin1:ben[i]])
  78. str_tok = re.sub(',|\s','',str_tok)
  79. cur_tok = ''.join(tokens[i][begin1:ben[i]])
  80. cur_tok = re.sub(',|\s','',cur_tok)
  81. if re.findall(jiandu, str_tok):
  82. flag = 0
  83. elif re.findall(zhaobiao, str_tok):
  84. flag = 1
  85. elif re.findall(daili, str_tok):
  86. flag = 2
  87. # elif re.findall(zhongbiao, str_tok):
  88. # flag = 3
  89. elif re.findall(shigong, str_tok):
  90. flag = 0
  91. elif re.findall(lianxi, str_tok):
  92. flag = 3
  93. else:
  94. flag = 0
  95. count += 1
  96. label.append(flag)
  97. else:
  98. pass
  99. new_data = {'pre_ent':pre_ent, 'label':label,'cur_ent':cur_ent, 'entity_id':ent_idl, 'shiti':shiti, 'sentence_index':s_list, 'begin_index':b_list}
  100. data_label = pd.DataFrame(new_data)
  101. data_label.to_excel(file2, encoding='utf-8', index=False, columns=['entity_id','sentence_index','begin_index','pre_ent','label','cur_ent','shiti'])
  102. with open('data/person_label.pk', 'wb') as f:
  103. pickle.dump(data_label, f)
  104. def post_data():
  105. '''
  106. @summary: 将标注好的数据推送到数据库
  107. '''
  108. conn = psycopg2.connect(dbname='BidiPro', user='postgres',password='postgres',host='192.168.2.101')
  109. cursor = conn.cursor()
  110. table = 'person_label'
  111. cursor.execute(" select to_regclass('"+table+"') is null ")
  112. notExists = cursor.fetchall()[0][0]
  113. if notExists:
  114. cursor.execute(" create table "+table+" (entity_id text,label int)")
  115. else:
  116. cursor.execute(" delete from "+table)
  117. df3 = pd.read_excel('data/person_label.xls', header=0)
  118. df3.head(3)
  119. entity_id = df3['entity_id']
  120. label = df3['label']
  121. for i in range(len(entity_id)):
  122. sql = " insert into "+table+"(entity_id,label) values('"+str(df3['entity_id'][i])+"',"+str(int(label[i]))+")"
  123. #print(sql)
  124. cursor.execute(sql)
  125. conn.commit()
  126. cursor.close()
  127. conn.close()
  128. if __name__ == '__main__':
  129. #get_data()
  130. label_data()
  131. post_data()