projectLabel.py

# encoding: utf-8
import codecs
import json
import re
from collections import Counter

import numpy
import psycopg2
from keras.preprocessing.sequence import pad_sequences

projectName_pattern_str = "工程|项目|系统"


def importProjectData():
    '''
    @summary: import the project codes and names extracted by the original algorithm
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    file = "C:\\Users\\User\\Desktop\\bxkc.zhongbiao_extraction.json"
    with codecs.open(file, "r", encoding="utf8") as f:
        data = f.read().strip()
    # the dump is a stream of concatenated JSON objects; turn it into one JSON array
    data = "[" + data.replace("}", "},")[:-1] + "]"
    for item in json.loads(data):
        if len(item.keys()) != 3:
            continue
        doc_id = item["document_id"]
        projectCode = item["project_code"]
        projectName = item["project_name"]
        # parameterized query instead of string concatenation, to avoid SQL injection
        sql = " insert into project_compare(doc_id,projectcode,projectname) values(%s,%s,%s) "
        print(sql)
        cursor.execute(sql, (doc_id, projectCode, projectName))
    conn.commit()
    conn.close()
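
# Illustration (added; toy input, not from the real dump) of the stream-to-array
# transform in importProjectData:
#   '{"a":1}{"b":2}'  ->  data.replace("}", "},")[:-1]  ->  '{"a":1},{"b":2}'
# which, wrapped in brackets, gives '[{"a":1},{"b":2}]' and parses with json.loads.
# Note this only works while the objects themselves contain no nested "}".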


def generateDatas(MAX_LEN, min_freq=1, vocab_set=None):
    '''
    @summary: query the labeled data from the database and numericize it into training and test data
    @param:
        MAX_LEN: maximum sentence length
        min_freq: minimum frequency for a character to enter the vocabulary
        vocab_set: vocabulary of the pretrained character embeddings; if None, one is built from the labeled data
    @return:
        train_process: numericized training data
        test_process: numericized test data
        vocab: vocabulary
        chunk_tags: label set
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select A.content,B.projectcode,B.projectname,B.doc_id from articles_processed A,project_relabel B where A.id=B.doc_id "
    order_sql = " order by B.doc_id "
    # the 300 lowest doc_ids form the test set; everything else is the training set
    train_sql = " and B.doc_id not in (select doc_id from project order by doc_id limit 300) "
    test_sql = " and B.doc_id in (select doc_id from project order by doc_id limit 300) "

    def hasNotBeenLabeled(items, code_begin, code):
        # True if none of the characters in [code_begin, code_begin + len(code)) is labeled yet
        for i in range(code_begin, code_begin + len(code)):
            if items[i][1] != "O":
                return False
        return True
    def findAllIndex(substr, wholestr):
        # collect the start indexes of all non-overlapping occurrences of substr in wholestr
        copystr = wholestr
        result = []
        indexappend = 0
        while True:
            index = copystr.find(substr)
            if index < 0:
                break
            result.append(indexappend + index)
            indexappend += index + len(substr)
            copystr = copystr[index + len(substr):]
        return result
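
    # Example (added for illustration): findAllIndex("ab", "xxabyyab") returns [2, 6];
    # occurrences are non-overlapping, so findAllIndex("aa", "aaaa") returns [0, 2].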
    def replaceCharAndNumber(data):
        # normalize tokens in place: map every latin letter to "C" and every digit to "N"
        char_replace = re.compile("[a-zA-Z]")
        number_replace = re.compile("[0-9]")
        for i in range(len(data)):
            for j in range(len(data[i])):
                data[i][j][0] = re.sub(number_replace, "N", re.sub(char_replace, "C", data[i][j][0]))
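
    # Example (added for illustration): a token list [["A","O"], ["1","O"], ["工","O"]]
    # becomes [["C","O"], ["N","O"], ["工","O"]] in place.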

    print(sql + train_sql + order_sql)
    cursor.execute(sql + train_sql + order_sql)
    rows = cursor.fetchall()
    train = []
    for row in rows:
        code = row[1] if row[1] else ""
        name = row[2] if row[2] else ""
        codes = re.split("[;;]", code)
        codes.sort(key=lambda x: len(x), reverse=True)  # label longer entities first
        names = re.split("[;;]", name)
        names.sort(key=lambda x: len(x), reverse=True)
        contents = re.split("。", str(row[0]))
        for content in contents:
            out_of_len = False
            double_flag = False
            data_item = [[char, "O"] for char in content]
            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code) == 0:
                    continue
                code_begins = findAllIndex(code, content)
                for code_begin in code_begins:
                    # the entity would be cut by truncation at MAX_LEN
                    if code_begin < MAX_LEN and (code_begin + len(code)) > MAX_LEN:
                        out_of_len = True
                    double_begin = code_begin - 5 if code_begin > 5 else 0
                    # an explicit "编号/名称" cue right before the entity
                    if len(re.findall("编号|名称", content[double_begin:code_begin])) > 0:
                        double_flag = True
                    code_find_flag = True
                    if len(code) == 1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name) == 0:
                    continue
                name_begins = findAllIndex(name, content)
                for name_begin in name_begins:
                    if name_begin < MAX_LEN and (name_begin + len(name)) > MAX_LEN:
                        out_of_len = True
                    double_begin = name_begin - 5 if name_begin > 5 else 0
                    if len(re.findall("编号|名称", content[double_begin:name_begin])) > 0:
                        double_flag = True
                    name_find_flag = True
                    if len(name) == 1:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PN_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            if code_find_flag or name_find_flag:
                if not out_of_len:
                    if double_flag:
                        # oversample sentences that carry an explicit "编号/名称" cue
                        train.append(data_item)
                    train.append(data_item)
            else:
                # keep roughly 5% of the sentences without any entity as negative samples
                if numpy.random.random() <= 0.05:
                    train.append(data_item)
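
    # Illustration (added; hypothetical sentence) of the labeling scheme: for a
    # project name "某某工程" occurring in a sentence, its characters are tagged
    #   某 -> PN_B, 某 -> PN_M, 工 -> PN_M, 程 -> PN_E
    # and a project code is tagged PC_B ... PC_E the same way; single-character
    # entities get PC_S/PN_S, and every other character keeps the tag "O".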
    print(sql + test_sql + order_sql)
    cursor.execute(sql + test_sql + order_sql)
    rows = cursor.fetchall()
    test = []
    list_docid_content = []
    for row in rows:
        code = row[1] if row[1] else ""
        name = row[2] if row[2] else ""
        codes = re.split("[;;]", code)
        codes.sort(key=lambda x: len(x), reverse=True)
        names = re.split("[;;]", name)
        names.sort(key=lambda x: len(x), reverse=True)
        contents = re.split("。", str(row[0]))
        for content in contents:
            data_item = [[char, "O"] for char in content]
            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code) == 0:
                    continue
                code_begins = findAllIndex(code, content)
                for code_begin in code_begins:
                    code_find_flag = True
                    if len(code) == 1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name) == 0:
                    continue
                name_begins = findAllIndex(name, content)
                for name_begin in name_begins:
                    name_find_flag = True
                    if len(name) == 1:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PN_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            if code_find_flag or name_find_flag:
                test.append(data_item)
                list_docid_content.append([row[3], content])
            else:
                # keep roughly 5% of the entity-free sentences as negatives
                if numpy.random.random() <= 0.05:
                    test.append(data_item)
                    list_docid_content.append([row[3], content])
    #replaceCharAndNumber(train)
    #replaceCharAndNumber(test)
    word_counts = Counter(token[0] for sample in train for token in sample)
    chunk_tags = sorted(set(token[1] for sample in train + test for token in sample))  # in alphabetic order
    if vocab_set is not None:
        vocab = vocab_set
    else:
        vocab = ['<pad>', '<unk>'] + [w for w, f in word_counts.items() if f >= min_freq]
    with codecs.open("viewTest.txt", "w", encoding="utf8") as f:
        for t in test:
            for h in t:
                f.write(str(h[0]) + " " + str(h[1]))
                f.write("\n")
            f.write("$#\n")  # sentence separator
        f.flush()
    with codecs.open("viewTrain.txt", "w", encoding="utf8") as f:
        for t in train:
            for h in t:
                f.write(str(h[0]) + " " + str(h[1]))
                f.write("\n")
            f.write("$#\n")
        f.flush()
    with codecs.open("docid_content.txt", "w", encoding="utf8") as f:
        for t in list_docid_content:
            f.write(str(t[0]) + " " + t[1])
            f.write("\n")
        f.flush()
    train_process = _process_data(train, vocab, chunk_tags, MAX_LEN)
    test_process = _process_data(test, vocab, chunk_tags, MAX_LEN)
    #return (train[0][:-200],train[1][:-200]),(train[0][-200:],train[1][-200:]),(vocab, chunk_tags)
    return train_process, test_process, (vocab, chunk_tags), test


def _process_data(data, vocab, chunk_tags, maxlen):
    '''
    @summary: numericize the data
    @param:
        data: list of sentences
        vocab: vocabulary
        chunk_tags: list of labels
        maxlen: maximum sentence length; if None, the length of the longest sentence is used
    @return:
        x: padded, numericized data
        y_chunk: the corresponding labels
    '''
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    print(len(vocab))
    print(vocab.index("<unk>"))
    index_unk = word2idx.get("<unk>")
    index_pad = word2idx.get("<pad>")
    print("unk", index_unk, "pad", index_pad)
    x = [[word2idx.get(w[0], index_unk) for w in s] for s in data]  # characters not in vocab map to <unk>
    x_len = [maxlen if len(s) > maxlen else len(s) for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    x = pad_sequences(x, maxlen, value=index_pad, padding='post', truncating='post')  # right padding
    y_chunk = pad_sequences(y_chunk, maxlen, value=0, padding='post', truncating='post')
    # y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_chunk, x_len
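

# Minimal sketch (added; toy values, not from the project data) of what
# _process_data returns:
#   vocab = ['<pad>', '<unk>', '项', '目']
#   chunk_tags = ['O', 'PN_B', 'PN_E']
#   data = [[['项', 'PN_B'], ['目', 'PN_E']]]
#   x, y_chunk, x_len = _process_data(data, vocab, chunk_tags, maxlen=4)
#   # x       -> [[2, 3, 0, 0]]   (characters mapped to indexes, padded with <pad>=0)
#   # y_chunk -> [[1, 2, 0, 0]]   (tags mapped to indexes, padded with 0, i.e. 'O')
#   # x_len   -> [2]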


if __name__ == "__main__":
    '''
    importProjectData()
    '''
    # (train_x, train_y), (test_x, test_y), (vocab, class_labels), test = generateDatas(MAX_LEN=300)
    # print(len(train_x))
    # print(len(test_x))
    # print(class_labels)