12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
# Default input paths (relative to the working directory) intended for
# get_pos_relation(); presumably CoNLL-style tab-separated files, going by
# the extension.
filenames = [
    "data/THU/train.conll",
    "data/THU/dev.conll",
]
def _write_tag_counts(path, pairs):
    """Write ``(tag, count)`` pairs to *path* as ``tag<TAB>count`` lines."""
    with open(path, "w", encoding="utf8") as out:
        for tag, count in pairs:
            out.write("%s\t%s\n" % (tag, count))


def get_pos_relation(filenames):
    """Count the cpostag, postag and deprel columns over several files.

    Each non-blank line is stripped, split on tabs, and its first eight
    columns are interpreted as
    ``id, form, lemma, cpostag, postag, feats, head, deprel``.
    The three frequency tables are printed together with their sizes and
    written, most frequent first, to ``cpostag.txt``, ``postag.txt`` and
    ``relation.txt`` in the current working directory.

    Args:
        filenames: Iterable of paths to UTF-8 encoded, tab-separated files.

    Raises:
        ValueError: If a non-blank line has fewer than eight columns. Nothing
            is written in that case, because counting finishes before any
            output file is opened.
    """
    # Local import so the block stays self-contained within the file.
    from collections import Counter

    cpostag_counts = Counter()
    postag_counts = Counter()
    relation_counts = Counter()

    for filename in filenames:
        with open(filename, "r", encoding="utf8") as f:
            for lineno, raw_line in enumerate(f, 1):
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no token and are skipped.
                    continue
                fields = line.split("\t")
                if len(fields) < 8:
                    raise ValueError(
                        "%s:%d: expected at least 8 tab-separated columns, got %d"
                        % (filename, lineno, len(fields))
                    )
                cpostag_counts[fields[3]] += 1
                postag_counts[fields[4]] += 1
                relation_counts[fields[7]] += 1

    # most_common() sorts by count, descending, and keeps first-seen order
    # among ties -- the same order as a stable sort with reverse=True.
    list_cpostag = cpostag_counts.most_common()
    list_postag = postag_counts.most_common()
    list_relation = relation_counts.most_common()

    print(list_cpostag, len(list_cpostag))
    print(list_postag, len(list_postag))
    print(list_relation, len(list_relation))

    _write_tag_counts("cpostag.txt", list_cpostag)
    _write_tag_counts("postag.txt", list_postag)
    _write_tag_counts("relation.txt", list_relation)
def get_legal_postag(minnum=100, filename="postag.txt"):
    """Return the tags from a count file whose count is at least *minnum*.

    The file is expected to hold one ``tag<TAB>count`` entry per line, which
    is the layout get_pos_relation() writes to ``postag.txt``.

    Args:
        minnum: Minimum count (inclusive) for a tag to be kept.
        filename: Path of the count file; defaults to ``postag.txt`` in the
            current working directory.

    Returns:
        List of tags in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or its count is not an integer.
    """
    legal_postag = []
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A trailing or stray blank line would otherwise break the
                # two-value unpacking below.
                continue
            tag, count = line.split("\t")
            if int(count) >= minnum:
                legal_postag.append(tag)
    return legal_postag
def get_legal_relation(minnum=100, filename="relation.txt"):
    """Return the relations from a count file whose count is at least *minnum*.

    The file is expected to hold one ``relation<TAB>count`` entry per line,
    which is the layout get_pos_relation() writes to ``relation.txt``.

    Args:
        minnum: Minimum count (inclusive) for a relation to be kept.
        filename: Path of the count file; defaults to ``relation.txt`` in the
            current working directory.

    Returns:
        List of relation labels in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or its count is not an integer.
    """
    legal_relation = []
    with open(filename, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A trailing or stray blank line would otherwise break the
                # two-value unpacking below.
                continue
            relation, count = line.split("\t")
            if int(count) >= minnum:
                legal_relation.append(relation)
    return legal_relation
if __name__ == "__main__":
    # The counting pass is left disabled, so postag.txt and relation.txt
    # must already exist in the working directory for the calls below.
    # get_pos_relation(filenames)

    # Return values are not used here; the calls only exercise the loaders.
    get_legal_postag()
    get_legal_relation()
|