# THU_preprocess.py
# Tag/relation frequency statistics for the THU CoNLL files.
  1. filenames = ["data/THU/train.conll", "data/THU/dev.conll"]
  2. def get_pos_relation(filenames):
  3. dict_pos = {}
  4. dict_pos2 = {}
  5. dict_relation = {}
  6. for filename in filenames:
  7. with open(filename,"r",encoding="utf8") as f:
  8. while 1:
  9. line = f.readline()
  10. if not line:
  11. break
  12. if line.strip()=="":
  13. continue
  14. id,form,lemma,cpostag,postag,feats,head,deprel = line.strip().split("\t")[:8]
  15. if cpostag not in dict_pos:
  16. dict_pos[cpostag] = 0
  17. dict_pos[cpostag] += 1
  18. if postag not in dict_pos2:
  19. dict_pos2[postag] = 0
  20. dict_pos2[postag] += 1
  21. if deprel not in dict_relation:
  22. dict_relation[deprel] = 0
  23. dict_relation[deprel] += 1
  24. list_cpostag = list(dict_pos.items())
  25. list_cpostag.sort(key=lambda x:x[1],reverse=True)
  26. list_postag = list(dict_pos2.items())
  27. list_postag.sort(key=lambda x:x[1],reverse=True)
  28. list_relation = list(dict_relation.items())
  29. list_relation.sort(key=lambda x:x[1],reverse=True)
  30. print(list_cpostag,len(list_cpostag))
  31. print(list_postag,len(list_postag))
  32. print(list_relation,len(list_relation))
  33. with open("cpostag.txt","w",encoding="utf8") as f:
  34. for k,v in list_cpostag:
  35. f.write("%s\t%s\n"%(k,v))
  36. with open("postag.txt","w",encoding="utf8") as f:
  37. for k,v in list_postag:
  38. f.write("%s\t%s\n"%(k,v))
  39. with open("relation.txt","w",encoding="utf8") as f:
  40. for k,v in list_relation:
  41. f.write("%s\t%s\n"%(k,v))
  42. def get_legal_postag(minnum=100):
  43. with open("postag.txt","r",encoding="utf8") as f:
  44. lines = f.readlines()
  45. legal_postag = []
  46. for line in lines:
  47. line = line.strip()
  48. k,v = line.split("\t")
  49. if int(v)>=minnum:
  50. legal_postag.append(k)
  51. return legal_postag
  52. def get_legal_relation(minnum=100):
  53. with open("relation.txt","r",encoding="utf8") as f:
  54. lines = f.readlines()
  55. legal_relation = []
  56. for line in lines:
  57. line = line.strip()
  58. k,v = line.split("\t")
  59. if int(v)>=minnum:
  60. legal_relation.append(k)
  61. return legal_relation
  62. if __name__ == '__main__':
  63. # get_pos_relation(filenames)
  64. get_legal_postag()
  65. get_legal_relation()